In this notebook we load the dataset, clean it by removing mentions, hashtags, URLs, punctuation, and numbers, and then apply lemmatization.

In [None]:
import pandas as pd
from google.colab import drive


# Mount Google Drive so the CSV stored there can be read from this Colab session.
drive.mount('/content/drive')

# Encoding passed to read_csv for the raw file.
df_encoding = "latin-1"

# The file is read with header=None, so the column names are supplied here.
cols = ['sentiment','id','date','query_string','user','original_tweets']

df = pd.read_csv("/content/drive/My Drive/deep-learning/training.1600000.processed.noemoticon.csv", encoding=df_encoding, header=None, names=cols)

# Only the label and the tweet text are used in the rest of the notebook.
df.drop(['id','date','query_string','user'],axis=1,inplace=True)

# Shuffle the rows. The sample must be drawn WITHOUT replacement: with
# replace=True this is a bootstrap resample, which duplicates some tweets,
# silently drops others and skews the class balance. frac=1 without
# replacement is a pure permutation that keeps every row exactly once.
df = df.sample(frac=1, random_state=1)

df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,sentiment,original_tweets
128037,0,Oh really don't wanna be awake
491755,0,Trying to amuse my cousin. It's not working! a...
470924,0,@JonasAustralia i wanted to win! congrats to ...
491263,0,That's it!! I can't take it no more!! After su...
836489,4,@beckybootsx i hope your not drinking alcohol!...


In [None]:
# number of rows per raw sentiment label
df.sentiment.value_counts()

4    800352
0    799648
Name: sentiment, dtype: int64

## Define useful functions


In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import re
import string

# download the NLTK resources used by this notebook
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words("english"))

# shared lemmatizer instance used by do_lem
lemmatizer = WordNetLemmatizer()

# function to remove punctuation
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps each punctuation character to "delete".
    drop_table = str.maketrans('', '', string.punctuation)
    return text.translate(drop_table)

def remove_stopwords(text):
    """Drop every space-separated token of ``text`` that appears in ``stop_words``.

    The text is split on single spaces and the surviving tokens are joined
    back together with single spaces.
    """
    kept = []
    for token in text.split(' '):
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

def remove_numbers(text):
    """Return ``text`` without any character for which ``str.isdigit`` is true."""
    non_digits = filter(lambda ch: not ch.isdigit(), text)
    return ''.join(non_digits)

def remove_single_chars(text):
    """Keep only the space-separated tokens of ``text`` longer than one character."""
    long_tokens = []
    for token in text.split(' '):
        if len(token) > 1:
            long_tokens.append(token)
    return ' '.join(long_tokens)

def do_lem(text):
    """Lemmatize each space-separated token of ``text`` with the shared ``lemmatizer``.

    Returns the lemmatized tokens joined by single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split(' '))
    return ' '.join(lemmas)


# Patterns used by clean_text. They are written as raw strings (so "\S" and
# "\s" are not treated as invalid string escapes) and compiled once here
# instead of being rebuilt on every call.
_URL_RE = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
_MENTION_RE = re.compile(r"@[A-Za-z0-9]+")
_HASHTAG_RE = re.compile(r"#[A-Za-z0-9]+")
_CONTAINS_AT_RE = re.compile(r"\S*@\S*\s?")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize one tweet into lowercase, lemmatized plain text.

    Steps, in order:
      1. convert the input to ``str`` and lowercase it;
      2. delete URLs;
      3. delete ``@mentions`` and then any remaining token containing ``@``;
      4. delete ``#hashtags`` (the whole tag, including its word);
      5. delete punctuation and digits;
      6. collapse whitespace runs to one space and strip the ends;
      7. lemmatize every remaining token.

    Stop-word removal and single-character-token removal are intentionally
    not applied (the helpers exist but are not called here).

    Parameters
    ----------
    text : any
        The raw tweet; it is passed through ``str()`` first.

    Returns
    -------
    str
        The cleaned text; may be an empty string.
    """
    text = str(text).lower()
    text = _URL_RE.sub('', text)
    text = _MENTION_RE.sub('', text)
    text = _CONTAINS_AT_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    text = remove_punctuations(text)
    text = remove_numbers(text)
    # Whitespace is normalized before lemmatizing because do_lem splits on
    # single spaces.
    text = _WHITESPACE_RE.sub(' ', text).strip()
    text = do_lem(text)

    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Data Cleaning

In [None]:
# run clean_text on every raw tweet and store the result in a new 'tweets' column
df['tweets'] = df.original_tweets.apply(clean_text)

In [None]:
# recode the labels: 0 stays 0 (negative), 4 becomes 1 (positive)
df['sentiment'] = df['sentiment'].replace({0: 0, 4: 1})

df.head()

Unnamed: 0,sentiment,original_tweets,tweets
128037,0,Oh really don't wanna be awake,oh really dont wanna be awake
491755,0,Trying to amuse my cousin. It's not working! a...,trying to amuse my cousin it not working and h...
470924,0,@JonasAustralia i wanted to win! congrats to ...,i wanted to win congrats to her anyways
491263,0,That's it!! I can't take it no more!! After su...,thats it i cant take it no more after summer s...
836489,1,@beckybootsx i hope your not drinking alcohol!...,i hope your not drinking alcohol lol


In [None]:
# the raw text column is dropped now that the cleaned 'tweets' column exists
df.drop(columns=['original_tweets'], inplace=True)

In [None]:
# summary of the frame: index range, columns, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1600000 entries, 128037 to 1034136
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   tweets     1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 36.6+ MB


In [None]:
# keep only rows whose cleaned tweet is longer than one character
mask = df['tweets'].str.len().gt(1)
df = df[mask]
df.sentiment.value_counts()

1    798734
0    798157
Name: sentiment, dtype: int64

In [None]:
# keep only rows whose cleaned tweet has at most 140 characters
mask = df['tweets'].str.len().le(140)
df = df[mask]
df.sentiment.value_counts()

1    798456
0    797970
Name: sentiment, dtype: int64

In [None]:
# write the processed dataset to Drive as CSV, without the row index
df.to_csv(
    '/content/drive/My Drive/deep-learning/cleaned-dataset.csv',
    index=False,
)

In [None]:
# Flush and unmount the Drive mount, then print a confirmation message.
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.
