In [15]:
import csv
import re
import json
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import maxabs_scale
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Read in train and test sets
train_tweets = []
train_scores = []
with open("training.1600000.processed.noemoticon.csv", "r") as train_file:
    trainreader = csv.reader(train_file)
    try:
        for row in trainreader:
            score = int(row[0])/4
            if score != 0.5:
                train_tweets.append(row[5])
                train_scores.append(score)
    except UnicodeDecodeError:
        print(row)
        
test_tweets = []
test_scores = []
with open("testdata.manual.2009.06.14.csv", "r") as test_file:
    testreader = csv.reader(test_file)
    try:
        for row in testreader:
            score = int(row[0])/4
            if score != 0.5:
                test_tweets.append(row[5])
                test_scores.append(score)
    except UnicodeDecodeError:
        print(row)

In [5]:
# Preprocess data
def preprocess(tweets):
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
    REPLACE_USER = re.compile("@.+?\s")
    tweets = [REPLACE_NO_SPACE.sub("", line.lower()) for line in tweets]
    tweets = [REPLACE_USER.sub("@_ ", line.lower()) for line in tweets]
    return tweets

train_tweets = preprocess(train_tweets)
test_tweets = preprocess(test_tweets)

In [6]:
print(train_tweets[9])

@_ que me muera  


In [7]:
# Generate features from raw text
cv = CountVectorizer(binary=True)
cv.fit(train_tweets)
X = cv.transform(train_tweets)
X_test = cv.transform(test_tweets)
# Normalize data
X_scale = maxabs_scale(X)
X_test_scale = maxabs_scale(X_test)

In [8]:
# Get training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, train_scores, train_size = 0.75
)

In [9]:
# Find hyperparameters
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c, solver="saga")
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.7885725
Accuracy for C=0.05: 0.7957125
Accuracy for C=0.25: 0.7983575
Accuracy for C=0.5: 0.79806
Accuracy for C=1: 0.797245


In [10]:
# Train model with all data
final_model = LogisticRegression(C=0.25, solver="saga")
final_model.fit(X, train_scores)
print ("Final Accuracy: %s" 
       % accuracy_score(test_scores, final_model.predict(X_test)))

Final Accuracy: 0.8105849582172702


In [12]:
# Load movie data
movie_tweets = json.load(open("movie_tweets.json", "r"))

In [21]:
# Get average sentiment for each movie
for movie, data in movie_tweets.items():
    tweets = [tweet["text"] for tweet in data]
    feats = cv.transform(preprocess(tweets))
    preds = final_model.predict_proba(feats)
    print(movie, np.mean(preds, axis=0)[1])

Detective Pikachu 0.6924287408022424
Gemini Man 0.6057201698656581
The Irishman 0.6363357537136467
Angel Has Fallen 0.5734007681801666
Hobbs & Shaw 0.7836779496613233
Vivarium 0.6904062787399092
Birds of Prey 0.590655869945211
Jojo Rabbit 0.5926116692459874
Spies in Disguise 0.7200359795857291
Frozen II 0.5549495306836143
Uncut Gems 0.7386042052888141
Ad Astra 0.7152963790682016
Klaus 0.6470385221514315
The Two Popes 0.689480314859459
Marriage Story 0.5990777834393655
Avengers: Endgame 0.8944595986806599
Trolls World Tour 0.7384290491211631
Coffee & Kareem 0.7798542512906961
Knives Out 0.6483327451711969
Queen & Slim 0.6147024526372544
