In [11]:
import pandas as pd

from google.colab import drive
drive.mount('/content/drive/')

data = pd.read_csv('/content/drive/My Drive/CAIS_winterproj/data.csv')
data.info()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   valence  1600000 non-null  int64 
 1   author   1600000 non-null  object
 2   tweet    1600000 non-null  object
dtypes: int64(1), object(2)
memory usage: 36.6+ MB


Unnamed: 0,valence,author,tweet
0,0,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,scotthamilton,is upset that he can't update his Facebook by ...
2,0,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,ElleCTF,my whole body feels itchy and like its on fire
4,0,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...
1499995,4,evangenitals,Receiving the first check made out to the Evan...
1499996,4,jasonavp,http://twitpic.com/6vluu - R.I.P Jeffrey Guy D...
1499997,4,ZeJuria,@Clark427 Thanks! You're a doll for saying so
1499998,4,blooooooo,@heycassadee Can I ask a question?


In [2]:
!pip install keras
EMBEDDINGS_DIR = '/content/drive/My Drive/CAIS_winterproj/glove.6B.50d.txt'

import numpy as np
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import os

EMBEDDING_DIM = 50

#Assign data
valence = data['valence'].values
tweets = data['tweet'].values

#Tokenize the tweets (convert sentence to sequence of words)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tweets)

sequences = tokenizer.texts_to_sequences(tweets)
word_index = tokenizer.word_index

#Pad sequences
preproccessed_data = pad_sequences(sequences)

#Load pre-trained word embeddings
embeddings_index = {}
f = open(EMBEDDINGS_DIR,'rb')
for line in f:
    values = line.split()
    word = values[0].decode('UTF-8')
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

# prepare word embedding matrix
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector



In [27]:
idx = 200
print("Original tweet:\n", tweets[idx])
print()
print("Tweet after tokenization and padding:\n", preproccessed_data[idx])

Original tweet:
 Job Interview in Cardiff today, wish me luck! Got about 3 hours sleep 

Tweet after tokenization and padding:
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0  332 1069   10 6735   40  114   14
  327   48   61  104  223  116]


In [3]:
print("Training Data Size: ", preproccessed_data.shape)
print("Number of Tweets: ", preproccessed_data.shape[0])
print("Max Tweet Length: ", preproccessed_data.shape[1])
print()
print("Valence Size: ", valence.shape)

Training Data Size:  (1600000, 118)
Number of Tweets:  1600000
Max Tweet Length:  118

Valence Size:  (1600000,)


In [29]:
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(preproccessed_data, valence, test_size=0.70, random_state=4, stratify=valence)

from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty='l2', solver='liblinear', C=0.1)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
scores = clf.predict_proba(X_test)[:,1]   

print('Accuracy: ', accuracy_score(y_test, pred))
print('AUROC: ', roc_auc_score(y_test, scores))
print(classification_report(y_test, pred))

Accuracy:  0.5007142857142857
AUROC:  0.6003896462579719
              precision    recall  f1-score   support

           0       0.56      0.01      0.01    560000
           4       0.50      1.00      0.67    560000

    accuracy                           0.50   1120000
   macro avg       0.53      0.50      0.34   1120000
weighted avg       0.53      0.50      0.34   1120000

