Importing csv file

In [167]:
import pandas as pd
import re

In [168]:
df = pd.read_csv('../data/Tweets.csv', usecols=['text', 'selected_text', 'sentiment'])
pd.set_option('display.max_columns', None)

print(df.head())

                                                text  \
0                I`d have responded, if I were going   
1      Sooo SAD I will miss you here in San Diego!!!   
2                          my boss is bullying me...   
3                     what interview! leave me alone   
4   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment  
0  I`d have responded, if I were going   neutral  
1                             Sooo SAD  negative  
2                          bullying me  negative  
3                       leave me alone  negative  
4                        Sons of ****,  negative  


In [169]:
df.sentiment.value_counts()

neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

Removing records with null values

In [170]:
df = df.drop(df[df['selected_text'].isna()].index.tolist())
df.reset_index(drop=True, inplace=True)
print(df.isna().sum())

text             0
selected_text    0
sentiment        0
dtype: int64


Cleaning up the text

In [173]:
processed_features = []

for sentence in range(0, len(df)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(df.loc[sentence, 'selected_text']))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower().strip()

    processed_features.append(processed_feature)

In [174]:
processed_features

['i have responded if were going',
 'sooo sad',
 'bullying me',
 'leave me alone',
 'sons of',
 'http www dothebouncy com smf some shameless plugging for the best rangers forum on earth',
 'fun',
 'soooo high',
 'both of you',
 'wow just became cooler',
 'as much as love to be hopeful reckon the chances are minimal i never gonna get my cake and stuff',
 'like',
 'dangerously',
 'lost',
 'test test from the lg env2',
 'uh oh am sunburned',
 'sigh',
 'sick',
 'onna',
 'hes just not that into you',
 'oh marly m so sorry hope you find her soon 3 3',
 'interesting',
 'is cleaning the house for her family who is comming later today',
 'gotta restart my computer thought win7 was supposed to put an end to the constant rebootiness',
 'see wat mean bout foll0w friidays it called lose f0llowers friday smh',
 'the free fillin app on my ipod is fun im addicted',
 'i sorry',
 'no internet',
 'fun',
 'power back up not working too',
 'quite heavenly',
 'hope',
 'well so much for being unhappy for abo

In [175]:
df['selected_text'] = processed_features

Creating train, val, and test datasets 

In [176]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [191]:
df_fulltrain, df_test = train_test_split(df[['selected_text', 'sentiment']], test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_fulltrain, test_size=0.25, random_state=1)

len(df_train), len(df_val), len(df_test), len(df)

(16488, 5496, 5496, 27480)

In [192]:
df_train.shape, df_val.shape, df_test.shape

((16488, 2), (5496, 2), (5496, 2))

In [193]:
df_train.columns

Index(['selected_text', 'sentiment'], dtype='object')

In [194]:
y_train = df_train.sentiment.values
y_val = df_val.sentiment.values
y_test = df_test.sentiment.values

del df_train['sentiment']
del df_val['sentiment']
del df_test['sentiment']

In [181]:
df_train

Unnamed: 0,selected_text
20135,listening to fountain of youth by supastition ...
12611,no sir they did not was amazed when woke up th...
10342,awww that sucks but they re so awesome when yo...
12499,leno last show tonight
8661,thanks
...,...
4233,happy mothers day to all the grown happy the l...
26341,i like sunshine but do not like heat headaches...
14927,i don feel good haven felt good in 4 days
1448,ohhh please


In [182]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [183]:
tfid = TfidfVectorizer()
X_train = tfid.fit_transform(df_train['selected_text']).toarray()
X_val = tfid.transform(df_val['selected_text']).toarray()

In [184]:
X_train.shape

(16488, 12923)

In [187]:
from sklearn.ensemble import RandomForestClassifier

In [188]:
model = RandomForestClassifier(random_state=1)
model.fit(X_train, y_train)

In [196]:
y_pred = model.predict(X_val)

In [197]:
from sklearn.metrics import accuracy_score, confusion_matrix 

In [199]:
print(accuracy_score(y_val, y_pred))

0.8033114992721979
