In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
!pip install torchvision
import torch
import torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [18]:
train_on_gpu = torch.cuda.is_available()
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ... To use GPU, go under edit > notebook settings')
else:
    print('CUDA is available!  Training on GPU ...')
    print(gpu_info)

CUDA is available!  Training on GPU ...
Tue Dec  6 21:05:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      3MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------

In [19]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import re
import nltk
nltk.download('stopwords')
import time
import warnings
warnings.filterwarnings("ignore")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import LancasterStemmer
from sklearn.utils import shuffle


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
train_df = pd.read_csv('/content/drive/MyDrive/257_Project/train.csv', encoding="ISO-8859-1", header=None)
train_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

test_df = pd.read_csv('/content/drive/MyDrive/257_Project/test.csv', encoding="ISO-8859-1", header=None)
test_df.columns = ['polarity', 'id', 'date', 'query', 'user', 'tweet']

In [21]:
train_df.shape

(1600000, 6)

In [22]:
test_df.shape

(498, 6)

In [23]:
word_bank = []

# Function to remove predefined stopwords to reduce disk usage
def preprocess(text):
    review = re.sub('[^a-zA-Z]',' ',text) 
    review = review.lower()
    review = review.split()
    ps = LancasterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    return ' '.join(review)

Training the model on half dataset for this milestone, will do it on total data for final submission


In [103]:
train_df = shuffle(train_df,random_state=2)
train_df = train_df[1:1600000]

Setting train set and test set in a similar format for easier processing.

In [104]:
train_df['polarity'].value_counts()

0    522
1    473
Name: polarity, dtype: int64

In [105]:
train_df['polarity'] = train_df['polarity'].replace(4,1)
train_df

Unnamed: 0,polarity,id,date,query,user,tweet
354521,0,2039739138,Thu Jun 04 22:41:01 PDT 2009,NO_QUERY,TwoToneJohnny,@sargenthouse yes it does....yes it does.
902824,1,1694494928,Mon May 04 02:53:15 PDT 2009,NO_QUERY,An_g_e_la,Just planted flovers in the school garden with...
1596229,1,2192605981,Tue Jun 16 07:17:07 PDT 2009,NO_QUERY,Shinybiscuit,@steve_gray_ I'd have known where you were com...
315590,0,2002226764,Tue Jun 02 03:33:59 PDT 2009,NO_QUERY,JennyJen2304,at work
1017860,1,1881916843,Fri May 22 05:51:02 PDT 2009,NO_QUERY,CatsWire,hopes the sun will stay now - fleamarket tomor...
...,...,...,...,...,...,...
394767,0,2055711128,Sat Jun 06 09:49:01 PDT 2009,NO_QUERY,LaughableLily,Eris chops tiny whimpering puppies into cubes ...
1212204,1,1989228741,Mon Jun 01 00:47:31 PDT 2009,NO_QUERY,justmohit,@twilightfairy if u gotta ask..?!
1211643,1,1989174006,Mon Jun 01 00:35:43 PDT 2009,NO_QUERY,nsgmusic,Good morning peeps! I hope you all have a joll...
499088,0,2186411720,Mon Jun 15 18:48:08 PDT 2009,NO_QUERY,LuzDeLaEstrella,@timhaig I know! Doesn't work out that way for...


In [106]:
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,1,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,1,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,1,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,1,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,1,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,1,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,1,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [107]:
test_df['polarity'] = test_df['polarity'].replace(2,1)
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,1,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,1,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,1,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,1,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,1,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,1,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,1,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [108]:
test_df['polarity'] = test_df['polarity'].replace(4,1)
test_df

Unnamed: 0,polarity,id,date,query,user,tweet
0,1,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...
1,1,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
2,1,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
3,1,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
4,1,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
...,...,...,...,...,...,...
493,1,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat..."
495,1,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far..."


In [109]:
X_train = train_df['tweet'].apply(lambda x: preprocess(x))

In [110]:
y_train = train_df['polarity']
le = LabelEncoder()
y = le.fit_transform(y_train)

In [111]:
X_test = test_df['tweet']
y_test = test_df['polarity']

In [112]:
tfidf = TfidfVectorizer(max_features = 100)
X_train_tf = tfidf.fit_transform(X_train).toarray() 
X_test = tfidf.transform(X_test).toarray()

In [113]:
X_train_tf.shape, X_test.shape, y_train.shape, y_test.shape

((995, 100), (498, 100), (995,), (498,))

XGBoost

In [114]:
import xgboost as xgb

In [115]:
xg = xgb.XGBClassifier()
start_time = time.time()
xg.fit(X_train_tf, y_train)  
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 0.21762895584106445 secs


In [116]:
y_pred_xg = xg.predict(X_test)
print("Accuracy:\n", accuracy_score(y_test, y_pred_xg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xg))
print("Classification Report:\n", classification_report(y_test, y_pred_xg))

Accuracy:
 0.5943775100401606
Confusion Matrix:
 [[139  38]
 [164 157]]
Classification Report:
               precision    recall  f1-score   support

           0       0.46      0.79      0.58       177
           1       0.81      0.49      0.61       321

    accuracy                           0.59       498
   macro avg       0.63      0.64      0.59       498
weighted avg       0.68      0.59      0.60       498



Random Forest

In [117]:
rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
start_time = time.time()
rf.fit(X_train_tf, y_train)
print("Execution Time:", time.time()-start_time,"secs")

Execution Time: 0.0382688045501709 secs


In [118]:
y_pred_rf = rf.predict(X_test)
print("Accuracy:\n", accuracy_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Accuracy:
 0.6024096385542169
Confusion Matrix:
 [[ 75 102]
 [ 96 225]]
Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.42      0.43       177
           1       0.69      0.70      0.69       321

    accuracy                           0.60       498
   macro avg       0.56      0.56      0.56       498
weighted avg       0.60      0.60      0.60       498

