Reading in Data

In [None]:
from itertools import islice

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

In [None]:
# Download the senate tweet CSV and discard its "Unnamed: 0" column
# (presumably an index that was written out when the file was saved).
senate_df = pd.read_csv(
    "https://drive.google.com/uc?id=1mMSM9bfu4SxeRDvGsM32eU4Cb6t80cj9"
).drop(columns="Unnamed: 0")

In [None]:
# The rep tweets are stored as three separate CSV files; download each part,
# stack them in order under a fresh 0..n-1 index, and discard the
# "Unnamed: 0" column.
rep_df_one, rep_df_two, rep_df_three = (
    pd.read_csv(part_url)
    for part_url in (
        "https://drive.google.com/uc?id=1xPTbtPQozq4wU43TU8iBXrnSnZTVeoMq",
        "https://drive.google.com/uc?id=1HWIbkxPReQDAhDIiuPlGTK6kq1TTJseM",
        "https://drive.google.com/uc?id=1C0-BVLXd9LjYTXkjJ_rUCsOQx46itrv9",
    )
)
rep_df = pd.concat(
    [rep_df_one, rep_df_two, rep_df_three],
    ignore_index=True,
).drop(columns="Unnamed: 0")

In [None]:
# Normalise the tweet text of both frames the same way:
#   1. turn double newlines, then any remaining single newline, into a space;
#   2. keep only what precedes the first "https", i.e. drop the first link
#      and everything after it.
for tweets in (rep_df, senate_df):
  tweets["Text"] = (
      tweets["Text"]
      .str.replace("\n\n", " ")
      .str.replace("\n", " ")
      .str.split("https")
      .str[0]
  )

In [None]:
# Lookup tables that are joined to the tweet frames on their "Twitter" column
# in the following cells.
rep_info = pd.read_csv(
    "https://raw.githubusercontent.com/kyleschmoyer/DataSci/main/rep_info-3.csv"
)
senate_info = pd.read_csv(
    "https://raw.githubusercontent.com/kyleschmoyer/DataSci/main/senate_info-3.csv"
)

# Manual correction of the "State" value for the row labelled 88.
# NOTE(review): 88 is a hard-coded row label -- confirm it still points at the
# intended senator whenever the CSV is regenerated.
senate_info.loc[88, "State"] = "Vermont"

In [None]:
# Attach the senator metadata to every senate tweet by matching the tweet's
# "User" to the metadata's "Twitter" handle (default inner join), drop the now
# redundant "Twitter" column, and reduce "Date" to a plain datetime.date.
senate_comb = (
    pd.merge(senate_df, senate_info, left_on="User", right_on="Twitter")
    .drop(columns="Twitter")
    .assign(
        Date=lambda frame: pd.to_datetime(frame["Date"], format='%Y-%m-%d').dt.date
    )
)

In [None]:
# Attach the rep metadata to every rep tweet by matching the tweet's "User" to
# the metadata's "Twitter" handle (default inner join), drop the now redundant
# "Twitter" column, and reduce "Date" to a plain datetime.date.
rep_comb = (
    pd.merge(rep_df, rep_info, left_on="User", right_on="Twitter")
    .drop(columns="Twitter")
    .assign(
        Date=lambda frame: pd.to_datetime(frame["Date"], format='%Y-%m-%d').dt.date
    )
)

In [None]:
# Combine both chambers into one frame and keep only the two major parties so
# the task is a binary classification.
comb = pd.concat([senate_comb, rep_comb], ignore_index=True)
# drop=True: renumber the rows 0..n-1 without copying the old index into a
# spurious "index" column.
comb = comb[comb["Party"] != "Independent"].reset_index(drop=True)

Early KNN

In [None]:
from sklearn.model_selection import cross_val_score
# Sweep the number of neighbours (25, 50, 75, 100) for a unigram TF-IDF + KNN
# classifier and report the mean macro-F1 of 10-fold cross-validation on the
# first 10000 tweets.
for k in range(25, 101, 25):
  pipeline = make_pipeline(
      TfidfVectorizer(ngram_range=(1,1), stop_words='english'),
      KNeighborsClassifier(n_neighbors=k, metric="euclidean"))
  # No pipeline.fit(...) on the full corpus here: cross_val_score clones the
  # estimator and refits it on every fold, so a prior fit never influences the
  # scores and only costs time.
  scores = cross_val_score(
      pipeline, comb["Text"][:10000], comb["Party"][:10000],
      scoring="f1_macro",
      cv=10)
  print(k, scores.mean())

25 0.3827769174301621
50 0.7600670737143437
75 0.8133248348841609
100 0.811735429655922


Testing other hyperparameters like N-Gram Range

In [None]:
# Same KNN setup (k=75, Euclidean) but with bigram-only TF-IDF features,
# scored by 10-fold macro-F1 on the first 10000 tweets.
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(2,2), stop_words='english'),
    KNeighborsClassifier(n_neighbors=75, metric="euclidean"))
# No pipeline.fit(...) on the full corpus: cross_val_score clones and refits
# the pipeline per fold, so that fit would be discarded work.
scores = cross_val_score(
    pipeline, comb["Text"].head(10000), comb["Party"].head(10000),
    scoring="f1_macro",
    cv=10)
print(scores.mean())

0.3448671824397209


In [None]:
# Rebuild the unigram TF-IDF + KNN (k=75, Euclidean) pipeline and fit it on the
# whole corpus.
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', ngram_range=(1,1)),
    KNeighborsClassifier(metric="euclidean", n_neighbors=75),
)
pipeline.fit(comb["Text"], comb["Party"])

In [None]:
# 10-fold macro-F1 of the current pipeline on a different slice of the data
# (rows 10000-19999) as a sanity check of the earlier score.
scores = cross_val_score(
    pipeline,
    comb["Text"].iloc[10000:20000],
    comb["Party"].iloc[10000:20000],
    cv=10,
    scoring="f1_macro",
)
print(scores.mean())

0.7886279660188751


Testing more hyperparameters.

In [None]:
# KNN (k=75, Euclidean) with a vocabulary cap: max_df=100 is an integer, which
# TfidfVectorizer interprets as an absolute count, so terms occurring in more
# than 100 documents are ignored. Scored by 10-fold macro-F1 on the first
# 10000 tweets.
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,1), stop_words='english', max_df = 100),
    KNeighborsClassifier(n_neighbors=75, metric="euclidean"))
# No pipeline.fit(...) on the full corpus: cross_val_score clones and refits
# the pipeline per fold, so that fit would be discarded work.
scores = cross_val_score(
    pipeline, comb["Text"].head(10000), comb["Party"].head(10000),
    scoring="f1_macro",
    cv=10)
print(scores.mean())

0.6234769365196435


Additional distance metric testing.

In [None]:
# Same unigram TF-IDF + KNN (k=75) but with Manhattan instead of Euclidean
# distance, scored by 10-fold macro-F1 on the first 10000 tweets.
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,1), stop_words='english'),
    KNeighborsClassifier(n_neighbors=75, metric="manhattan"))
# No pipeline.fit(...) on the full corpus: cross_val_score clones and refits
# the pipeline per fold, so that fit would be discarded work.
scores = cross_val_score(
    pipeline, comb["Text"].head(10000), comb["Party"].head(10000),
    scoring="f1_macro",
    cv=10)
print(scores.mean())

0.3459354095902688


New model: Multinomial Naive Bayes, which performed better.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

# Sweep the additive-smoothing strength alpha of a unigram TF-IDF +
# Multinomial Naive Bayes classifier and report the mean macro-F1 of 10-fold
# cross-validation on the first 50000 tweets.
for alpha in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
  pipeline = make_pipeline(
      TfidfVectorizer(ngram_range=(1,1), stop_words='english'),
      MultinomialNB(alpha=alpha))
  # No pipeline.fit(...) on the full corpus here: cross_val_score clones the
  # estimator and refits it on every fold, so fitting first would be repeated
  # ten times for nothing.
  scores = cross_val_score(
      pipeline, comb["Text"].head(50000), comb["Party"].head(50000),
      scoring="f1_macro",
      cv=10)
  print(alpha, scores.mean())

0.1 0.7886622612932054
0.2 0.7910221764745298
0.3 0.7912043047510213
0.4 0.7914182349379468
0.5 0.7912694288061093
0.6 0.7907060945669466
0.7 0.790355835086651
0.8 0.7894662039666291
0.9 0.7888140625584052
1 0.7885645271360493


Testing on the first 10000 tweets because it's slower.

In [None]:
# Naive Bayes with alpha=0.4, scored by 10-fold macro-F1 on the first 10000
# tweets, i.e. the same subset the KNN experiments were scored on.
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,1), stop_words='english'),
    MultinomialNB(alpha=0.4))
# No pipeline.fit(...) on the full corpus: cross_val_score clones and refits
# the pipeline per fold, so that fit would be discarded work.
scores = cross_val_score(
    pipeline, comb["Text"].head(10000), comb["Party"].head(10000),
    scoring="f1_macro",
    cv=10)
print(scores.mean())

0.8539533544168677


Attempted ensemble methods, but they were neither as fast nor as high-scoring.

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Stacking ensemble: the KNN and Naive Bayes configurations used above act as
# base learners and a logistic regression combines their outputs. Scored by
# 5-fold macro-F1 on the first 10000 tweets.
model1 = KNeighborsClassifier(n_neighbors=75, metric="euclidean")
model2 = MultinomialNB(alpha=0.4)

stacker = StackingClassifier(
    estimators=[("Model 1", model1), ("Model 2", model2)],
    final_estimator=LogisticRegression(),
)
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1,1), stop_words='english'),
    stacker,
)
scores = cross_val_score(
    pipeline,
    comb["Text"].head(10000),
    comb["Party"].head(10000),
    scoring="f1_macro",
    cv=5,
)
print(scores.mean())

0.8089182516803962


In [None]:
!pip install snscrape
import snscrape.modules.twitter as sntwitter

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Final function to predict the party of each tweet from a particular user, with examples below.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


# Final classifier: unigram TF-IDF features feeding Multinomial Naive Bayes
# (alpha=0.4), fitted on every tweet in `comb`. `prediction` below uses this
# fitted pipeline.
model2 = MultinomialNB(alpha=0.4)
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english', ngram_range=(1,1)),
    model2,
).fit(comb["Text"], comb["Party"])  # Pipeline.fit returns the pipeline itself
def prediction(username, max_tweets=1000):
  """Print how many of a user's tweets the fitted model assigns to each party.

  Scrapes up to ``max_tweets`` tweets posted by ``username`` with snscrape,
  applies the same text clean-up that the training tweets received, classifies
  each tweet with the module-level ``pipeline`` and prints the label counts.

  Args:
    username: Twitter handle without the leading "@", e.g. "POTUS".
    max_tweets: Upper bound on the number of tweets to scrape (default 1000).

  Returns:
    None. The per-label counts (or a notice that no tweets were found) are
    printed.
  """
  scraper = sntwitter.TwitterSearchScraper("from:"+ username)
  # Gather the texts in a list and build the Series once; growing a DataFrame
  # with pd.concat for every tweet re-copies all earlier rows each time.
  raw_texts = [
      tweet.rawContent
      for tweet in islice(scraper.get_items(), max_tweets)
  ]
  if not raw_texts:
    # Nothing to classify; say so instead of passing an empty batch to the
    # model.
    print("No tweets found for " + username)
    return

  # Mirror the training-time clean-up (newlines -> spaces, cut at the first
  # "https") so the model sees text shaped like what it was fitted on.
  texts = (
      pd.Series(raw_texts, dtype="object")
      .str.replace("\n\n", " ", regex=False)
      .str.replace("\n", " ", regex=False)
      .str.split("https")
      .str[0]
  )
  print(pd.Series(pipeline.predict(texts)).value_counts())

  
  
  

In [None]:
prediction("DonaldJTrumpJr")

Republican Party    823
Democratic Party    177
dtype: int64


In [None]:
prediction("GovRonDeSantis")

Republican Party    746
Democratic Party    254
dtype: int64


In [None]:
prediction("POTUS")

Democratic Party    759
Republican Party    241
dtype: int64


In [None]:
prediction("laurenboebert")

Republican Party    829
Democratic Party    171
dtype: int64


In [None]:
prediction("KamalaHarris")

Democratic Party    881
Republican Party    119
dtype: int64


In [None]:
prediction("BarackObama")

Democratic Party    828
Republican Party    172
dtype: int64


In [None]:
prediction("MichelleObama")

Democratic Party    802
Republican Party    198
dtype: int64


In [None]:
prediction("BenShapiro")

Republican Party    773
Democratic Party    227
dtype: int64


In [None]:
prediction("StevenCrowder")

Republican Party    714
Democratic Party    286
dtype: int64


In [None]:
prediction("maddow")

Republican Party    660
Democratic Party    340
dtype: int64


In [None]:
prediction("HillaryClinton")

Democratic Party    759
Republican Party    241
dtype: int64
