# 08. Most Like Finder

This notebook finds the five other Senators or Representatives who are most and least like the user input Twitter account (must be an account for a member of the 116th Congress).

## Imports

In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
# from sklearn.metrics import silhouette_score, pairwise_distances
from sklearn.preprocessing import StandardScaler

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances, cosine_distances, cosine_similarity

from sklearn.manifold import TSNE

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [9]:
# Read in data
data = pd.read_csv('../data/vectors_labels_df.csv')

## Create Dataframe Using Best Model

In [11]:
X_ = data['liststring']

cvec = CountVectorizer(max_features = 250, ngram_range = (1, 1))

X_c = pd.DataFrame(cvec.fit_transform(X_).toarray(), columns = cvec.get_feature_names())

# Applying the Standard Scaler to word vectors
sc = StandardScaler()
X_sc_c = sc.fit_transform(X_c)

# Applying standard scaler, then applying PCA to word vectors
pca_sc = PCA(n_components = 0.95)
X_pca_sc_c = pca_sc.fit_transform(X_sc_c)

# TSNE w/ scaled data
tsne = TSNE(n_components=2)
X_tsne_pca_sc_c = tsne.fit_transform(X_pca_sc_c)

In [13]:
# Creating a dataframe from vectorized dataset
df = pd.DataFrame(X_tsne_pca_sc_c)

In [16]:
# Calculating cosine similarity scores
similarity = cosine_similarity(df)

df2 = pd.DataFrame(similarity)

In [17]:
# Merging names back onto Cosine Similarity data
columns_keep2 =['screen_name']
data2= data.reset_index()
data2 = data2[columns_keep2]

# Merge
data_ = data2.join(df2)
data_.set_index('screen_name',inplace=True)

# getting a list of all screen names
list_of_names = data['screen_name'].to_list()

# applying list of names to columns
data_.columns=list_of_names

# making text lower
data_.index= data_.index.str.lower()

## Function to Find Most, Least Like Twitter User of Interest

In [53]:
def sim_scores(twitter_handle):
    twitter_handle = twitter_handle.lower()
    most = pd.DataFrame(data_.loc[twitter_handle].sort_values(ascending=False)[1:6])
    least = pd.DataFrame(data_.loc[twitter_handle].sort_values(ascending=True)[:5])
    return most, least

In [61]:
# Enter name of twitter account here:
most, least = sim_scores('SenDuckworth')

display(most, least)

Unnamed: 0,senduckworth
RepPressley,0.999999
NydiaVelazquez,0.999998
SenFeinstein,0.99999
RepChuyGarcia,0.999986
LeaderHoyer,0.999931


Unnamed: 0,senduckworth
RepTrey,-1.0
RepRussFulcher,-0.999998
RepPeteStauber,-0.999996
RepMikeTurner,-0.999995
SenJohnThune,-0.999989


In [21]:
# https://www.geeksforgeeks.org/python-measure-similarity-between-two-sentences-using-cosine-similarity/
# https://leantechblog.wordpress.com/2020/08/23/how-to-estimate-text-similarity-with-python/
# https://www.datacamp.com/community/tutorials/recommender-systems-python
# https://stackabuse.com/creating-a-simple-recommender-system-in-python-using-pandas/