In [9]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [10]:
# Load the raw CSV. The file is read with header=None, so the six column
# labels are attached explicitly right after parsing.
data = pd.read_csv(
    './data/training.csv',
    header=None,
    encoding="ISO-8859-1",
).set_axis(['sentiment', 'id', 'date', 'query', 'user', 'tweet'], axis=1)

# Quick look at the first rows and the overall dimensions.
print(data.head())

print("Size of the dataset", data.shape)

   sentiment          id                          date     query  \
0          0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1          0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2          0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3          0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4          0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                              tweet  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  
Size of the dataset (1600000, 6)


In [11]:
# Data-quality check: nulls per column and number of fully duplicated rows.
missing_per_column = data.isnull().sum()
duplicate_row_count = data.duplicated().sum()

print("Missing Values \n\n", missing_per_column)
print("Duplicated Values \n", duplicate_row_count)

Missing Values 

 sentiment    0
id           0
date         0
query        0
user         0
tweet        0
dtype: int64
Duplicated Values 
 0


#### There are no missing or duplicate values.

In [12]:
# Preprocessing: report how often each kind of Twitter noise occurs, then strip
# it from the `tweet` column. Each count is printed before its replacement, so
# it describes the text as it stood just before that step.

# URLs. The dot after "www" is escaped and the prefix is anchored at a word
# boundary: an unescaped `www.` matches any character after "www", which also
# swallows stretched interjections such as "awwwww".
print("Number of http links", data['tweet'].str.count('http').sum())
data['tweet'] = data['tweet'].str.replace(r'http\S+|\bwww\.\S+', '', case=False, regex=True)

# User mentions.
print("Number of @ mentions", data['tweet'].str.count('@').sum())
data['tweet'] = data['tweet'].str.replace(r'@\S+', '', case=False, regex=True)

# Hashtags.
print("Number of # mentions", data['tweet'].str.count('#').sum())
data['tweet'] = data['tweet'].str.replace(r'#\S+', '', case=False, regex=True)

# Retweet marker. Word boundaries restrict the match to a standalone "RT"
# token; a bare case-insensitive 'RT' pattern deletes every "rt" inside
# ordinary words as well ("heart" -> "hea", "party" -> "pay").
print("Number of RT", data['tweet'].str.count(r'\bRT\b', flags=re.IGNORECASE).sum())
data['tweet'] = data['tweet'].str.replace(r'\bRT\b', '', case=False, regex=True)

Number of http links 71635
Number of @ mentions 798628
Number of # mentions 45133
Number of RT 0


In [None]:
# English stop words from NLTK plus two extra tokens to drop.
# NOTE(review): 'quot' and 'amp' are presumably what remains of the HTML
# entities &quot; and &amp; once punctuation is stripped - confirm.
stop_words = set(stopwords.words('english')) | {'quot', 'amp'}

# Shared lemmatizer instance, created once and reused for every token.
lemma = WordNetLemmatizer()


def clean_text(text):
    """Normalise one tweet into a space-separated string of lemmatised tokens.

    Processing order: lowercase the text, replace URLs and every non-letter
    character with a space, tokenise with ``word_tokenize``, drop tokens found
    in the module-level ``stop_words`` set, lemmatise each remaining token with
    the module-level ``lemma``, discard tokens of two characters or fewer, and
    join what is left with single spaces.

    Args:
        text: The raw tweet. Non-string values are converted with ``str()``.
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned tokens joined by single spaces. An empty string is
        returned for missing input or when no token survives the filters.
    """
    # Missing values must not be stringified: str(None) / str(nan) would turn
    # into the literal tokens "none" / "nan" and end up in the cleaned text.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    without_urls = re.sub(r'http\S+', ' ', lowered)
    # Keep letters only; digits, punctuation and symbols become separators.
    letters_only = re.sub('[^a-zA-Z]', ' ', without_urls)

    tokens = word_tokenize(letters_only)
    # Stop words are filtered before lemmatisation; the length filter runs on
    # the lemmatised form, so it applies to the final tokens.
    lemmas = [lemma.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(word for word in lemmas if len(word) > 2)


# Run the cleaner over every tweet and keep the result in a new column, so the
# `tweet` column stays available alongside it.
data['clean_tweet'] = data['tweet'].map(clean_text)