In [1]:
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 
from sklearn import metrics
from sklearn.metrics import confusion_matrix
%matplotlib inline
import seaborn as sns

import numpy as np # linear algebra
import pandas as pd #data processing

import os
import re
import nltk

In [2]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

In [3]:
train.shape , test.shape

((3235, 6), (1387, 5))

In [4]:
train.head()

Unnamed: 0,id,original_text,lang,retweet_count,original_author,sentiment_class
0,1.245025e+18,Happy #MothersDay to all you amazing mothers o...,en,0,BeenXXPired,0
1,1.245759e+18,Happy Mothers Day Mum - I'm sorry I can't be t...,en,1,FestiveFeeling,0
2,1.246087e+18,Happy mothers day To all This doing a mothers ...,en,0,KrisAllenSak,-1
3,1.244803e+18,Happy mothers day to this beautiful woman...ro...,en,0,Queenuchee,0
4,1.244876e+18,Remembering the 3 most amazing ladies who made...,en,0,brittan17446794,-1


In [6]:
print(train.isnull().sum())
print('************')
print(test.isnull().sum())

id                 0
original_text      0
lang               4
retweet_count      4
original_author    0
sentiment_class    0
dtype: int64
************
id                 0
original_text      0
lang               0
retweet_count      1
original_author    0
dtype: int64


In [9]:
train["sentiment_class"].value_counts()

 0    1701
-1     769
 1     765
Name: sentiment_class, dtype: int64

In [10]:
combi = train.append(test, ignore_index=True) 
combi.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


(4622, 6)

In [14]:
combi.drop(['id','lang','retweet_count','original_author'],axis=1,inplace=True)
combi.shape

(4622, 2)

# Tokenization

In [11]:
#Downloading nltk data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Stop words

In [12]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [16]:
for index,row in combi.iterrows():
    filter_sentence = ''
    
    sentence = row['original_text']
    sentence = re.sub(r'[^\w\s]','',sentence) #cleaning
    
    words = nltk.word_tokenize(sentence) #tokenization
    
    words = [w for w in words if not w in stop_words]  #stopwords removal
    
    for word in words:
        filter_sentence = filter_sentence + ' ' + str(lemmatizer.lemmatize(word)).lower()
        
    combi.loc[index,'original_text'] = filter_sentence


In [17]:
combi.head()

Unnamed: 0,original_text,sentiment_class
0,happy mothersday amazing mother i know hard a...,0.0
1,happy mothers day mum im sorry i cant bring m...,0.0
2,happy mother day to this mother day work toda...,-1.0
3,happy mother day beautiful womanroyalty sooth...,0.0
4,remembering 3 amazing lady made i my late gra...,-1.0


In [18]:
combi = combi[['original_text','sentiment_class']]

In [20]:
combi.head()

Unnamed: 0,original_text,sentiment_class
0,happy mothersday amazing mother i know hard a...,0.0
1,happy mothers day mum im sorry i cant bring m...,0.0
2,happy mother day to this mother day work toda...,-1.0
3,happy mother day beautiful womanroyalty sooth...,0.0
4,remembering 3 amazing lady made i my late gra...,-1.0


In [21]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
X = combi['original_text']
Y = combi['sentiment_class']

In [26]:
#Feature extraction using count vectorization and tfidf.
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(X)
freq_term_matrix = count_vectorizer.transform(X)
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)
tf_idf_matrix = tfidf.fit_transform(freq_term_matrix)

In [27]:
tf_idf_matrix.toarray().shape

(4622, 19092)

In [28]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    See full source and example: 
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    thresh = cm.max() / 2.
    for i, j in iter.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [36]:
XTrain = tf_idf_matrix[:3235,:]
XTest  = tf_idf_matrix[3235:,:]

#split in samples
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(XTrain, train['sentiment_class'], random_state=0)

In [39]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
rf = RandomForestClassifier(n_estimators=400, random_state=11).fit(X_train, y_train) 
prediction = rf.predict(X_test)

print(f1_score(y_test, prediction,average='micro'))

0.4956736711990111
Wall time: 30.3 s


In [40]:
test_pred = rf.predict(XTest) 
test['sentiment_class'] = test_pred 
submission = test[['id','sentiment_class']] 
submission.to_csv('sub_rf_bow.csv', index=False)