In [8]:
import csv
import re
import json
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import maxabs_scale
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [10]:
# Read in train and test sets
train_tweets = []
train_scores = []
with open("training_data/training.1600000.processed.noemoticon.csv", "r") as train_file:
    trainreader = csv.reader(train_file)
    try:
        for row in trainreader:
            score = int(row[0])/4
            if score != 0.5:
                train_tweets.append(row[5])
                train_scores.append(score)
    except UnicodeDecodeError:
        print(row)
        
test_tweets = []
test_scores = []
with open("training_data/testdata.manual.2009.06.14.csv", "r") as test_file:
    testreader = csv.reader(test_file)
    try:
        for row in testreader:
            score = int(row[0])/4
            if score != 0.5:
                test_tweets.append(row[5])
                test_scores.append(score)
    except UnicodeDecodeError:
        print(row)

In [11]:
# Preprocess data
def preprocess(tweets):
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
    REPLACE_USER = re.compile("@.+?\s")
    tweets = [REPLACE_NO_SPACE.sub("", line.lower()) for line in tweets]
    tweets = [REPLACE_USER.sub("@_ ", line.lower()) for line in tweets]
    return tweets

train_tweets = preprocess(train_tweets)
test_tweets = preprocess(test_tweets)

In [12]:
# Generate features from raw text
cv = CountVectorizer(binary=True, ngram_range=(1,3))
cv.fit(train_tweets)
X = cv.transform(train_tweets)
X_test = cv.transform(test_tweets)
# Normalize data
X_scale = maxabs_scale(X)
X_test_scale = maxabs_scale(X_test)

In [13]:
# Get training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, train_scores, train_size = 0.75
)

In [15]:
# Find hyperparameters
tune = False
if tune:
    for c in [0.01, 0.05, 0.25, 0.5, 1]:
        lr = LogisticRegression(C=c, solver="saga")
        lr.fit(X_train, y_train)
        print ("Accuracy for C=%s: %s" 
               % (c, accuracy_score(y_val, lr.predict(X_val))))

In [14]:
# Train model with all data
final_model = LogisticRegression(C=0.25, solver="saga")
final_model.fit(X, train_scores)
print ("Final Accuracy: %s" 
       % accuracy_score(test_scores, final_model.predict(X_test)))

Final Accuracy: 0.8523676880222841


In [56]:
pickle.dump(cv, open("vectorizer.p", "wb"))
pickle.dump(final_model, open("model.p", "wb"))