In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

In [3]:
!pip install torchvision
import torch
import torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
train_on_gpu = torch.cuda.is_available()
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ... To use GPU, go under edit > notebook settings')
else:
    print('CUDA is available!  Training on GPU ...')
    print(gpu_info)

CUDA is available!  Training on GPU ...
Mon Nov 28 21:26:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------

In [5]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.utils import shuffle

import re
import nltk
nltk.download('stopwords')
import time
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
train_df = pd.read_csv('/content/drive/MyDrive/257_Project/train.csv', encoding="ISO-8859-1", header=None)
train_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

test_df = pd.read_csv('/content/drive/MyDrive/257_Project/test.csv', encoding="ISO-8859-1", header=None)
test_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

In [7]:
train_df.shape

(1600000, 6)

In [8]:
test_df.shape

(498, 6)

In [9]:
word_bank = []

# Function to remove predefined stopwords to reduce disk usage
def preprocess(text):
    review = re.sub('[^a-zA-Z]',' ',text) 
    review = review.lower()
    review = review.split()
    ps = LancasterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    return ' '.join(review)

Training the model on 50000 samples for time being, will do it on total data for final submission


In [10]:
train_df = shuffle(train_df,random_state=2)
train_df = train_df[1:800000]

In [11]:
train_df['polarity'].value_counts()

4    400618
0    399381
Name: polarity, dtype: int64

In [12]:
train_df['polarity'] = train_df['polarity'].replace(4,1)
train_df

Unnamed: 0,polarity,id,date,query,user,tweet
408251,0,2059364084,Sat Jun 06 16:47:03 PDT 2009,NO_QUERY,MrsAmarieB,my baby's growing up
1559739,1,2186151891,Mon Jun 15 18:25:49 PDT 2009,NO_QUERY,epallaviccini,Painted Black-Rolling Stones..the best!
571248,0,2208723981,Wed Jun 17 09:33:02 PDT 2009,NO_QUERY,Kiwitabby,"kk, i'm logging off now BYEZZ!"
524639,0,2193564503,Tue Jun 16 08:37:46 PDT 2009,NO_QUERY,annaqui,Shitty shitty shitty news today
311150,0,2001240587,Tue Jun 02 00:15:30 PDT 2009,NO_QUERY,smellyocheese,@Askmewhats * hugs* what's wrong?
...,...,...,...,...,...,...
763296,0,2298265644,Tue Jun 23 11:27:45 PDT 2009,NO_QUERY,literaryescapis,This is so not good. The hubby is looking for...
8054,0,1470056461,Tue Apr 07 08:13:02 PDT 2009,NO_QUERY,kategardiner,r.i.p. fave jeans. If you weren't discontinued...
13823,0,1553425279,Sat Apr 18 14:13:55 PDT 2009,NO_QUERY,Sabs604,@ErinAsh10 ME too!!!! Its so boring no you and...
1436455,1,2060993244,Sat Jun 06 19:58:33 PDT 2009,NO_QUERY,KingdomGeek,@kristarella I just introduced the ability for...


In [13]:
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [14]:
test_df['polarity'] = test_df['polarity'].replace(2,1)
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,1,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [15]:
test_df['polarity'] = test_df['polarity'].replace(4,1)
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,1,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,1,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,1,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,1,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,1,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,1,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,1,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [16]:
X_train = train_df['tweet'].apply(lambda x: preprocess(x))

In [17]:
y_train = train_df['polarity']
le = LabelEncoder()
y = le.fit_transform(y_train)

In [18]:
X_test = test_df['tweet']
y_test = test_df['polarity']

TEST SET PRE PROCESSING


In [19]:
tfidf = TfidfVectorizer(max_features = 600)
X_train_tf = tfidf.fit_transform(X_train).toarray() 
X_test = tfidf.transform(X_test).toarray()

In [20]:
X_train_tf.shape, X_test.shape, y_train.shape, y_test.shape

((799999, 600), (498, 600), (799999,), (498,))

**Logistic Regreession**

In [21]:
lr = LogisticRegression(random_state = 0)
start_time = time.time()
lr.fit(X_train_tf, y_train) 
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 45.39226508140564 secs


In [22]:
y_pred_lr = lr.predict(X_test)
print("Accuracy:\n", accuracy_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy:
 0.7449799196787149
Confusion Matrix:
 [[104  73]
 [ 54 267]]
Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.59      0.62       177
           1       0.79      0.83      0.81       321

    accuracy                           0.74       498
   macro avg       0.72      0.71      0.71       498
weighted avg       0.74      0.74      0.74       498



Current accuracy of the model using logistic regression on 400k samples : ~74%

**Decision Tree Classifier**

In [23]:
dc = DecisionTreeClassifier(criterion = 'entropy', random_state = 22)
start_time = time.time()
dc.fit(X_train_tf, y_train)
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 840.0275394916534 secs


In [24]:
y_pred_dc = dc.predict(X_test)
print("Accuracy:\n", accuracy_score(y_test, y_pred_dc))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dc))
print("Classification Report:\n", classification_report(y_test, y_pred_dc))

Accuracy:
 0.6506024096385542
Confusion Matrix:
 [[ 91  86]
 [ 88 233]]
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.51      0.51       177
           1       0.73      0.73      0.73       321

    accuracy                           0.65       498
   macro avg       0.62      0.62      0.62       498
weighted avg       0.65      0.65      0.65       498



Current accuracy of the model using decision tree classifier on 400k samples : ~65%

**Naive Bayes Classifier**

In [25]:
nb = MultinomialNB()
start_time = time.time()
nb.fit(X_train_tf,y_train)
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 1.6841657161712646 secs


In [26]:
y_pred_nb = nb.predict(X_test)
print("Accuracy:\n", accuracy_score(y_test, y_pred_nb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Accuracy:
 0.7248995983935743
Confusion Matrix:
 [[108  69]
 [ 68 253]]
Classification Report:
               precision    recall  f1-score   support

           0       0.61      0.61      0.61       177
           1       0.79      0.79      0.79       321

    accuracy                           0.72       498
   macro avg       0.70      0.70      0.70       498
weighted avg       0.72      0.72      0.72       498



Current accuracy of the model using decision tree classifier on 400k samples : ~72%