In [14]:
import nltk
import string
import re
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from joblib import dump, load

In [3]:
def clean_text(text):
    """Turn a raw string into a list of stemmed, stop-word-free tokens.

    Steps, in order:
      1. Remove every character found in ``string.punctuation`` and
         lowercase the remaining characters.
      2. Split on runs of non-word characters.
      3. Drop empty tokens and tokens present in the module-level
         ``stopwords`` collection.
      4. Stem each surviving token with the module-level ``ps`` stemmer.

    Args:
        text: The raw string to process.

    Returns:
        list: The stemmed tokens, in their original order. Empty when the
        input contains no usable tokens.
    """
    # Character-wise filter: punctuation is dropped before tokenising.
    stripped = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw string literal: '\W' in a plain literal is an invalid escape sequence.
    pieces = re.split(r'\W+', stripped)
    # re.split yields '' for empty input or leading/trailing separators;
    # `piece and ...` keeps those from becoming a spurious feature.
    return [ps.stem(piece) for piece in pieces if piece and piece not in stopwords]

# English stop words from NLTK, held in a set so that the per-token
# `word not in stopwords` test inside clean_text is a hash lookup rather
# than a linear scan of a list.
stopwords = set(nltk.corpus.stopwords.words('english'))
# Single Porter stemmer instance used by clean_text.
ps = nltk.PorterStemmer()
# Bigram count vectorizer. NOTE(review): not referenced by any other cell
# visible here — confirm whether it is still needed.
ngram_vectorizer = CountVectorizer(ngram_range=(2,2))
# TF-IDF vectorizer whose analyzer is clean_text, so raw strings passed to
# fit/transform are cleaned, tokenised and stemmed by that function.
tfidf_vectorizer = TfidfVectorizer(analyzer=clean_text)



In [4]:
# Load the pickled table (presumably one tweet per row — the cells below
# read its 'text' and 'party' columns).
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle('tweets.pkl')
# Store the token list produced by clean_text next to the raw text.
# The later cells visible here vectorize the raw 'text' column, not this one.
data['clean_text'] = data['text'].apply(clean_text)

In [5]:
# train_test_split returns (X_train, X_test, y_train, y_test) in that order.
# Unpacking in any other order silently swaps the partitions, which would
# leave only the 20% hold-out portion available for training.
# random_state is fixed so that re-running the notebook reproduces the split.
# NOTE(review): the training set is the 80% portion; the dense `.toarray()`
# conversion used when fitting scales with it — confirm memory is sufficient.
X_train, X_test, y_train, y_test = train_test_split(
    data[['text']], data['party'], test_size=0.2, random_state=42
)

In [6]:
# Fit the vectorizer on the training text only, so the test split does not
# influence what is learned. `fit` returns the vectorizer itself.
tfidf_fit = tfidf_vectorizer.fit(X_train['text'])
# Project both splits (train first, then test) through the fitted vectorizer.
tfidf_train, tfidf_test = (
    tfidf_fit.transform(frame['text']) for frame in (X_train, X_test)
)

In [9]:
# Fit a Gaussian naive Bayes classifier on the training TF-IDF features.
# The matrix is converted with `.toarray()` before being handed to `fit`;
# NOTE(review): that dense copy grows with rows x vocabulary size.
gnb = GaussianNB().fit(
    X=tfidf_train.toarray(),
    y=y_train,
)

In [10]:
def predict(sample):
    """Predict a label for one raw text (or for several).

    Args:
        sample: A raw text string, or an iterable of raw text strings.
            The text must NOT be pre-cleaned: ``tfidf_vectorizer`` was
            built with ``analyzer=clean_text`` and cleans each document
            itself.

    Returns:
        The result of ``gnb.predict`` on the vectorized input — one
        prediction per input document.
    """
    # A bare string is one document; wrap it so the vectorizer does not
    # receive it as a sequence of characters.
    documents = [sample] if isinstance(sample, str) else list(sample)
    # Hand the vectorizer a plain list of documents. Wrapping the text in a
    # DataFrame does not work: iterating a DataFrame yields its column
    # labels, so the only "document" vectorized would be the label itself.
    features = tfidf_vectorizer.transform(documents)
    return gnb.predict(features.toarray())

In [11]:
# Smoke test: classify one sample tweet and print the predicted label(s).
print(
    predict(
        "Thank you Kevin. Many Trump votes were routed to Biden. The highly respected Michigan Judge released this epic report. True all over the Country. This Fake Election can no longer stand. Get moving Republicans. Big Swing State Win!  75,000,000 VOTES."
    )
)

['Republican Party']


In [16]:
# Persist the fitted classifier to disk with joblib.
# NOTE(review): only `gnb` is written here; the fitted `tfidf_vectorizer`
# needed to featurize new text is not saved by this cell — confirm it is
# persisted elsewhere if the model is loaded in another process.
dump(value=gnb, filename='model.joblib')

['model.joblib']