In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import re
import nltk
nltk.download('stopwords')
import time
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
from sklearn.utils import shuffle


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
train_df = pd.read_csv('/content/drive/MyDrive/257_Project/train.csv', encoding="ISO-8859-1", header=None)
train_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

test_df = pd.read_csv('/content/drive/MyDrive/257_Project/test.csv', encoding="ISO-8859-1", header=None)
test_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

In [7]:
train_df.shape

(1600000, 6)

In [8]:
test_df.shape

(498, 6)

In [9]:
word_bank = []

# Function to remove predefined stopwords to reduce disk usage
def preprocess(text):
    review = re.sub('[^a-zA-Z]',' ',text) 
    review = review.lower()
    review = review.split()
    ps = LancasterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    return ' '.join(review)

Training the model on 50K samples for now, to check if the model runs properly and how the classfication works.


In [10]:
train_df = shuffle(train_df,random_state=2)
train_df = train_df[1:50000]

In [11]:
train_df['polarity'].value_counts()

4    25093
0    24906
Name: polarity, dtype: int64

In [12]:
X_train = train_df['tweet'].apply(lambda x: preprocess(x))

In [13]:
y_train = train_df['polarity']
le = LabelEncoder()
y = le.fit_transform(y_train)

In [14]:
X_test = test_df['tweet']
y_test = test_df['polarity']

In [15]:
tfidf = TfidfVectorizer(max_features = 100)
X_train_tf = tfidf.fit_transform(X_train).toarray() 
X_test = tfidf.transform(X_test).toarray()

In [16]:
X_train_tf.shape, X_test.shape, y_train.shape, y_test.shape

((49999, 100), (498, 100), (49999,), (498,))

**Logistic Regression**

In [17]:
lr = LogisticRegression(random_state = 0)
start_time = time.time()
lr.fit(X_train_tf, y_train) 
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 0.30883216857910156 secs


In [18]:
y_pred_lr = lr.predict(X_test)
print("Accuracy:\n", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy:
 0.41566265060240964
Confusion Matrix:
 [[ 66   0 111]
 [  8   0 131]
 [ 41   0 141]]
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.37      0.45       177
           2       0.00      0.00      0.00       139
           4       0.37      0.77      0.50       182

    accuracy                           0.42       498
   macro avg       0.31      0.38      0.32       498
weighted avg       0.34      0.42      0.34       498



Current accuracy of the model using logistic regression on 400k samples : ~42%



**Decision Tree**

In [19]:
dc = DecisionTreeClassifier(criterion = 'entropy', random_state = 22)
start_time = time.time()
dc.fit(X_train_tf, y_train)
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 4.841554880142212 secs


In [20]:
y_pred_dc = dc.predict(X_test)
print("Accuracy:\n", accuracy_score(y_test, y_pred_dc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dc))
print("Classification Report:\n", classification_report(y_test, y_pred_dc))

Accuracy:
 0.41365461847389556
Confusion Matrix:
 [[ 71   0 106]
 [ 22   0 117]
 [ 47   0 135]]
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.40      0.45       177
           2       0.00      0.00      0.00       139
           4       0.38      0.74      0.50       182

    accuracy                           0.41       498
   macro avg       0.29      0.38      0.32       498
weighted avg       0.32      0.41      0.34       498



Current accuracy of the model using decision tree classifier on 50k samples : ~41%

**Naive Bayes Classifier**

In [21]:
nb = MultinomialNB()
start_time = time.time()
nb.fit(X_train_tf,y_train)
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 0.035933732986450195 secs


In [22]:
y_pred_nb = nb.predict(X_test)
print("Accuracy:\n", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Accuracy:
 0.41767068273092367
Confusion Matrix:
 [[ 66   0 111]
 [ 10   0 129]
 [ 40   0 142]]
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.37      0.45       177
           2       0.00      0.00      0.00       139
           4       0.37      0.78      0.50       182

    accuracy                           0.42       498
   macro avg       0.31      0.38      0.32       498
weighted avg       0.34      0.42      0.34       498



Current accuracy of the model using decision tree classifier on 400k samples : ~42%