In [7]:
import pandas as pd
import re
from collections import Counter


In [8]:
# Merge text and label df
text_df = pd.read_json("raw_data/train_text.json",lines=True)
label_df = pd.read_json("raw_data/train_truth.json", lines=True)

df = pd.merge(text_df,label_df, on='twitter user id').drop(columns = ['tweet ids'])

In [9]:
# Expand the dataframe
expanded_rows = []
for index, row in df.iterrows():
    for text_dict in row['texts']:
        new_row = row.copy()
        new_row['texts'] = text_dict['text']
        expanded_rows.append(new_row)

df = pd.DataFrame(expanded_rows)

In [10]:
def preprocess_tweet(tweet):
    # Preprocess text (username and link placeholders)
    new_text = []
    for t in tweet.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    tweet = " ".join(new_text)

    # remove spaces
    tweet = tweet.strip()

    # remove new line character
    tweet = re.sub(r'\n','', tweet)
    
    # return mention count
    mentions = re.findall(r'@\w+', tweet)
    tweet = re.sub(r'@\w+', '', tweet)
    count_mentions = len(mentions)

    # return link count
    links = re.findall(r'http\w+', tweet)
    tweet = re.sub(r'http\w+', '', tweet)
    count_links = len(links)
    
    return tweet, count_mentions, count_links

In [11]:
# Apply the preprocessing function to the 'tweet' column
df['texts'], df['count_mention'], df['count_link'] = zip(*df['texts'].apply(preprocess_tweet))

# Remove duplicate tweets based on the 'tweet_processed' column
df = df.drop_duplicates(subset=['texts'])

def count_words(text):
    return len(text.split())

# Count the number of words in each row of the specified column
df['word_count'] = df['texts'].apply(count_words)

# Remove rows with less than 5 words in the specified column
df = df[df['word_count'] >= 5]

# Drop the 'word_count' column as it's no longer needed
df = df.drop(columns=['word_count'])

# Save the preprocessed DataFrame to a new CSV file
df.to_csv('data/preprocessed_data_finetune.csv', index=False)

In [12]:
df

Unnamed: 0,twitter user id,texts,class,count_mention,count_link
0,0037a672f0ed64b3231bac64853a278d,RT #Ape $ApeGive some of the Apes that are tw...,nano,1,0
0,0037a672f0ed64b3231bac64853a278d,I can’t see a single valid reason to sell $APE,nano,1,0
1,03eaa72711143b521c073d9ac5745923,RT Now we just need to knock our heels togeth...,nano,1,0
1,03eaa72711143b521c073d9ac5745923,☝️would love some input from #tezos #cleannft ...,nano,0,0
1,03eaa72711143b521c073d9ac5745923,RT Fresh drop 💜- 3 Men Please -the more the m...,nano,1,2
...,...,...,...,...,...
159,fec182516cba4b665e2215094bbcc527,RT Ok giving away 0.5 $SOL and 5 Discord invi...,nano,1,0
159,fec182516cba4b665e2215094bbcc527,RT Is not $SOL vs $ETH Is $SOL 🤝 $ETH,nano,1,0
159,fec182516cba4b665e2215094bbcc527,RT 🎁 Vanguards x Yaku Corp 🎁 Prizes:1 x Vangu...,nano,2,0
159,fec182516cba4b665e2215094bbcc527,RT Giving away a y00ts t00b NFT 🔮Floor price ...,nano,1,0
