In [74]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
import pandas as pd

In [76]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.utils import shuffle

import re
import nltk
nltk.download('stopwords')
import time
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [77]:
train_df = pd.read_csv('/content/drive/MyDrive/257_Project/train.csv', encoding="ISO-8859-1", header=None)
train_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

test_df = pd.read_csv('/content/drive/MyDrive/257_Project/test.csv', encoding="ISO-8859-1", header=None)
test_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']


In [78]:
train_df.shape

(1600000, 6)

In [79]:
test_df.shape

(498, 6)

In [80]:
word_bank = []

def preprocess(text):
    review = re.sub('[^a-zA-Z]',' ',text) 
    review = review.lower()
    review = review.split()
    ps = LancasterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    return ' '.join(review)

Training the model on 50000 samples for time being, will do it on total data for final submission


In [81]:
train_df = shuffle(train_df,random_state=2)
train_df = train_df[1:50000]

In [82]:
train_df['polarity'].value_counts()

4    25093
0    24906
Name: polarity, dtype: int64

In [83]:
X_train = train_df['tweet'].apply(lambda x: preprocess(x))

In [84]:
y_train = train_df['polarity']
le = LabelEncoder()
y = le.fit_transform(y_train)

In [85]:
X_test = test_df['tweet']
y_test = test_df['polarity']

In [86]:
tfidf = TfidfVectorizer(max_features = 600)
X_train_tf = tfidf.fit_transform(X_train).toarray() 
X_test_tf = tfidf.transform(X_test).toarray()

In [87]:
X_train_tf.shape, X_test_tf.shape, y_train.shape, y_test.shape

((49999, 600), (498, 600), (49999,), (498,))

**Logistic Regreession**

In [88]:
lr = LogisticRegression(random_state = 0)
start_time = time.time()
lr.fit(X_train_tf, y_train) 
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 1.807976484298706 secs


In [89]:
y_pred_lr = lr.predict(X_test_tf)
print("Accuracy:\n", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy:
 0.4799196787148594
Confusion Matrix:
 [[ 90   0  87]
 [ 10   0 129]
 [ 33   0 149]]
Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.51      0.58       177
           2       0.00      0.00      0.00       139
           4       0.41      0.82      0.54       182

    accuracy                           0.48       498
   macro avg       0.36      0.44      0.38       498
weighted avg       0.39      0.48      0.41       498



Current accuracy of the model using logistic regression on 50k samples : ~48%

**Decision Tree Classifier**

In [90]:
dc = DecisionTreeClassifier(criterion = 'entropy', random_state = 22)
start_time = time.time()
dc.fit(X_train_tf, y_train)
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 38.95518350601196 secs


In [91]:
y_pred_dc = dc.predict(X_test_tf)
print("Accuracy:\n", accuracy_score(y_test, y_pred_dc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dc))
print("Classification Report:\n", classification_report(y_test, y_pred_dc))

Accuracy:
 0.44779116465863456
Confusion Matrix:
 [[ 98   0  79]
 [ 39   0 100]
 [ 57   0 125]]
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.55      0.53       177
           2       0.00      0.00      0.00       139
           4       0.41      0.69      0.51       182

    accuracy                           0.45       498
   macro avg       0.31      0.41      0.35       498
weighted avg       0.33      0.45      0.38       498



Current accuracy of the model using decision tree classifier on 50k samples : ~45%

**Naive Bayes Classifier**

In [92]:
nb = MultinomialNB()
start_time = time.time()
nb.fit(X_train_tf,y_train)
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 0.13629865646362305 secs


In [93]:
y_pred_nb = nb.predict(X_test_tf)
print("Accuracy:\n", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Accuracy:
 0.4839357429718876
Confusion Matrix:
 [[100   0  77]
 [ 17   0 122]
 [ 41   0 141]]
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.56      0.60       177
           2       0.00      0.00      0.00       139
           4       0.41      0.77      0.54       182

    accuracy                           0.48       498
   macro avg       0.35      0.45      0.38       498
weighted avg       0.38      0.48      0.41       498

