# Cleaning the Tweets

In [69]:
# Data manipulation and analysis
import pandas as pd

# Text preprocessing
import re

# Natural Language Processing
import spacy

In [70]:
# Column names are supplied explicitly through ``names=`` when reading the CSV.
columns = ["target", "id", "date", "flag", "user", "text"]
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook can run on another machine.
file_path = "C:/Users/kunve/OneDrive/Desktop/Projects/twitter_sentiment_analysis_LR_NB/data/raw/sentiment_raw_data.csv"
data_df = pd.read_csv(
    file_path,
    names=columns,
    encoding="latin-1",
)

In [71]:
# Draw a fixed-seed random subset of 10,000 rows; the copy keeps later column
# assignments on df from touching data_df.
df = (
    data_df
    .sample(n=10_000, random_state=42)
    .copy()
)

### Text Cleanup

In [72]:
#Cleanup function
def lower_replace(series):
    """Lowercase a Series of strings and strip tweet-specific noise with regexes.

    The substitutions run in this order: mentions, URLs, leftover '@' runs,
    hashtags, digits, bracketed text, HTML-style tags, punctuation, and
    finally line breaks / tabs.

    Parameters
    ----------
    series : pandas.Series
        Series of strings (accessed through the ``.str`` accessor).

    Returns
    -------
    pandas.Series
        A new Series with the cleaned text. Runs of spaces left behind by the
        removals are not collapsed.
    """
    output = series.str.lower() #Lowercase
    output = output.str.replace(r'@\w+', '', regex=True) # Remove mentions (@handle)
    output = output.str.replace(r'http[s]?://\S+|www\.\S+', '', regex=True) # Remove URLs
    # Second '@' pass: after the \w+ pass above, this only still matches an '@'
    # followed by non-word characters (e.g. '@-x'); kept so results are unchanged.
    output = output.str.replace(r'@\S+', '', regex=True)
    output = output.str.replace(r'#\S+', '', regex=True) #Remove Hashtags
    output = output.str.replace(r'[0-9]', '', regex=True) #Remove Numbers
    # Remove text in brackets ([...] or (...)). Each pair is matched on its own:
    # the previous greedy '.*' form deleted everything between the first
    # opening and the last closing bracket, including text outside the pairs.
    # With nested brackets only the innermost pair is removed.
    output = output.str.replace(r'\([^()]*\)|\[[^\[\]]*\]', '', regex=True)
    output = output.str.replace(r'<[^>]+>', '', regex=True) #Remove HTML tags
    output = output.str.replace(r'[^\w\s]', '', regex=True) # Remove punctuation (anything that is not a word character or whitespace)
    # Replace line breaks and tabs (\n, \r, \t) with a single space so the words
    # on either side are not glued together.
    output = output.str.replace(r'[\r\n\t]+', ' ', regex=True)
    return output

In [73]:
# Store the regex-cleaned tweets in a new column, then display the frame.
df['text_clean'] = df['text'].pipe(lower_replace)
df

Unnamed: 0,target,id,date,flag,user,text,text_clean
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh i hope your ok
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo...",cool i have no tweet apps for my razr
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...,i know just family drama its lamehey next ti...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...,school email wont open and i have geography s...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem,upper airways problem
...,...,...,...,...,...,...,...
965972,4,1827657080,Sun May 17 11:10:20 PDT 2009,NO_QUERY,Hici96,"My sister, Cheyenne's birthday party is today!...",my sister cheyennes birthday party is today it...
1193436,4,1984262616,Sun May 31 14:45:44 PDT 2009,NO_QUERY,dauria,@addieking you are SO welcome,you are so welcome
896390,4,1693023020,Sun May 03 21:21:45 PDT 2009,NO_QUERY,evankmathews,@RebekahMaylene seriously!! Those gals were ma...,seriously those gals were making me nervous p...
544127,0,2201005638,Tue Jun 16 19:40:27 PDT 2009,NO_QUERY,jimbafrosty,@SarahMascara thanks now i want chicken and ch...,thanks now i want chicken and cheesecake


In [74]:
#Remove EMOJIS
# Character class of emoji and symbol code points. The former range
# U+24C2..U+1F251 has been replaced by its two end points: as a range it also
# matched nearly every BMP letter above U+24C2 (CJK, kana, Hangul, ...), so
# non-Latin text was deleted along with the emoji.
emoji_pattern = re.compile("["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # pictographs
    "\U0001F680-\U0001F6FF"  # transport/map
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, dingbats, arrows
    "\U00002702-\U000027B0"  # dingbats (subset of the range above)
    "\U000024C2"             # circled latin capital letter M
    "\U0001F251"             # circled ideograph accept
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"
    "\u200d"                 # zero width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297"                 # circled ideograph congratulation
    "\u3299"                 # circled ideograph secret
"]+", flags=re.UNICODE)

# Strip emoji through the .str accessor with the compiled pattern; unlike a
# per-element lambda, missing values pass through instead of raising TypeError.
# NOTE(review): the CSV is decoded as latin-1 and punctuation has already been
# stripped, so this pass may match little or nothing on this dataset — confirm.
df['text_clean'] = df['text_clean'].str.replace(emoji_pattern, '', regex=True)


### Tokenization, Lemmatization and Stop Words

In [75]:
# Load the small English spaCy pipeline with the parser and NER components
# disabled.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [76]:
from tqdm import tqdm

def spacy_preprocess(texts, batch_size=1000, n_process=4):
    """Lemmatize each text with the module-level spaCy pipeline ``nlp``.

    A token's lemma is kept only when the token is not a stop word, is purely
    alphabetic, and its lemma is not the ``"-PRON-"`` placeholder.

    Parameters
    ----------
    texts : sized iterable of str
        Texts to process; ``len(texts)`` is used for the progress bar total.
    batch_size : int
        Forwarded to ``nlp.pipe``.
    n_process : int
        Forwarded to ``nlp.pipe``.

    Returns
    -------
    list of str
        One space-joined string of kept lemmas per input text, in input order.
    """
    def keep(token):
        return not token.is_stop and token.is_alpha and token.lemma_ != "-PRON-"

    docs = nlp.pipe(texts, batch_size=batch_size, n_process=n_process)
    return [
        " ".join(token.lemma_ for token in doc if keep(token))
        for doc in tqdm(docs, total=len(texts))
    ]


In [77]:
# Cast to str, hand the underlying array to spaCy, and overwrite the column
# with the lemmatized result.
texts_to_lemmatize = df["text_clean"].astype(str).values
df["text_clean"] = spacy_preprocess(texts_to_lemmatize)


100%|██████████| 10000/10000 [00:14<00:00, 674.78it/s]


In [78]:
# Display the frame again now that text_clean holds the lemmatized text.
df

Unnamed: 0,target,id,date,flag,user,text,text_clean
541200,0,2200003196,Tue Jun 16 18:18:12 PDT 2009,NO_QUERY,LaLaLindsey0609,@chrishasboobs AHHH I HOPE YOUR OK!!!,ahhh hope ok
750,0,1467998485,Mon Apr 06 23:11:14 PDT 2009,NO_QUERY,sexygrneyes,"@misstoriblack cool , i have no tweet apps fo...",cool tweet app razr
766711,0,2300048954,Tue Jun 23 13:40:11 PDT 2009,NO_QUERY,sammydearr,@TiannaChaos i know just family drama. its la...,know family drama lamehey time u hang kim n u ...
285055,0,1993474027,Mon Jun 01 10:26:07 PDT 2009,NO_QUERY,Lamb_Leanne,School email won't open and I have geography ...,school email will not open geography stuff rev...
705995,0,2256550904,Sat Jun 20 12:56:51 PDT 2009,NO_QUERY,yogicerdito,upper airways problem,upper airway problem
...,...,...,...,...,...,...,...
965972,4,1827657080,Sun May 17 11:10:20 PDT 2009,NO_QUERY,Hici96,"My sister, Cheyenne's birthday party is today!...",sister cheyenne birthday party today go to fun
1193436,4,1984262616,Sun May 31 14:45:44 PDT 2009,NO_QUERY,dauria,@addieking you are SO welcome,welcome
896390,4,1693023020,Sun May 03 21:21:45 PDT 2009,NO_QUERY,evankmathews,@RebekahMaylene seriously!! Those gals were ma...,seriously gal make nervous party thank come
544127,0,2201005638,Tue Jun 16 19:40:27 PDT 2009,NO_QUERY,jimbafrosty,@SarahMascara thanks now i want chicken and ch...,thank want chicken cheesecake


In [80]:
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook can run on another machine.
output_path = "C:/Users/kunve/OneDrive/Desktop/Projects/twitter_sentiment_analysis_LR_NB/data/processed/cleaned.csv"
# Write every column without the sampled row index.
# NOTE(review): rows whose text_clean ended up empty are written as empty
# fields, which pandas.read_csv parses as NaN by default — verify downstream.
df.to_csv(
    output_path,
    index=False,
)