In [5]:
import os
import pickle
import sqlite3
from pathlib import Path

import numpy as np
import pandas as pd

In [6]:
# Connect to the SQLite database. The location can be overridden with the
# TWITOFF_DB_PATH environment variable; otherwise the original path is used.
DB_PATH = Path(os.environ.get('TWITOFF_DB_PATH',
                              'C:\\Users\\jerem\\github\\twitoff.sqlite3'))

# sqlite3.connect() silently creates an empty database when the file is missing,
# which only surfaces later as "no such table"; fail fast with a clear message instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f'SQLite database not found: {DB_PATH}')

conn = sqlite3.connect(str(DB_PATH))

In [7]:

def get_data(query, conn, params=()):
    '''Run a SQL statement on a SQLite connection and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    conn : sqlite3.Connection
        Open database connection.
    params : sequence or dict, optional
        Values bound to the placeholders in ``query``. Defaults to no parameters.

    Returns
    -------
    pandas.DataFrame
        One row per result row, with columns named after the result columns.
        A statement that produces no result set yields an empty DataFrame.
    '''
    cursor = conn.cursor()
    try:
        cursor.execute(query, params)

        # description is None when the statement returns no result set
        # (e.g. CREATE/INSERT), so there are no column names to read.
        if cursor.description is None:
            return pd.DataFrame()

        # The first item of each description entry is the column name
        columns = [column[0] for column in cursor.description]
        rows = cursor.fetchall()
    finally:
        # Release the cursor even if execution or fetching fails
        cursor.close()

    return pd.DataFrame(data=rows, columns=columns)

In [8]:
sql = '''
SELECT
	tweet.id,
	tweet.tweet,
	tweet.embedding,
	user.username
FROM tweet
JOIN user on tweet.user_id = user.id;
'''

# Fetch every tweet joined with the username of its author.
df = get_data(sql, conn)

# The embedding column is unpickled value by value into a new column.
# NOTE(review): pickle.loads can execute arbitrary code — only run this against a trusted database.
df['embedding_decoded'] = df.embedding.apply(pickle.loads)
print(df.shape)
df.head(3)

OperationalError: no such table: tweet

In [None]:
# Usernames of the two accounts the classifier will tell apart.
USER1 = 'barackobama'
USER2 = 'jimmyfallon'

user1_embeddings = df.embedding_decoded[df.username == USER1]
user2_embeddings = df.embedding_decoded[df.username == USER2]

# Without tweets for both users there is nothing to train a two-class model on.
if user1_embeddings.empty or user2_embeddings.empty:
    raise ValueError(f'No tweets found for {USER1!r} and/or {USER2!r}')

embeddings = pd.concat([user1_embeddings, user2_embeddings])

# One column per embedding dimension (dom0, dom1, ...); the width is taken
# from the data rather than assumed to be 300.
embeddings_df = pd.DataFrame(embeddings.to_list()).add_prefix('dom')

# Labels follow the concatenation order: 1 for USER1 rows, 0 for USER2 rows.
labels = np.concatenate([np.ones(len(user1_embeddings)),
                         np.zeros(len(user2_embeddings))])
print(embeddings_df.shape, labels.shape)


In [None]:

from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(embeddings_df, labels,
                               test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape, X_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000)
%timeit log_reg.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, plot_confusion_matrix

y_pred = log_reg.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

fig, ax = plt.subplots(figsize=(8,8))
plot_confusion_matrix(log_reg, X_test, y_test,
                      normalize='true', cmap='Blues',
                      display_labels=['Barack Obama', 'Jimmy Fallon'], ax=ax)
plt.title(f'LogReg Confusion Matrix (N={X_test.shape[0]})');

In [None]:
import spacy

# Pipeline components switched off when loading the pre-trained SpaCy model
DISABLED_COMPONENTS = ['tagger', 'parser']
nlp = spacy.load('en_core_web_md', disable=DISABLED_COMPONENTS)

def vectorize_tweet(nlp, tweet_text):
    '''Embed a piece of text with a SpaCy pipeline.

    Calls ``nlp`` on ``tweet_text`` and returns the ``vector`` attribute of
    the result as a plain Python list.
    '''
    doc = nlp(tweet_text)
    return list(doc.vector)

In [None]:
# Embed a sample sentence and preview the first five components
sample_tweet = "The innovation displayed during this pandemic is unprecedented."
new_embedding = vectorize_tweet(nlp, sample_tweet)
new_embedding[:5]


In [None]:
# Persist the trained classifier; the context manager closes the file
# even if pickling fails part-way through.
with open("../models/log_reg.pkl", "wb") as model_file:
    pickle.dump(log_reg, model_file)

In [None]:
# Reload the saved classifier and classify the new tweet embedding.
# NOTE(review): pickle.load can execute arbitrary code — only load model files you trust.
with open("../models/log_reg.pkl", "rb") as model_file:
    unpickled_lr = pickle.load(model_file)
unpickled_lr.predict([new_embedding])