In [66]:
import pandas as pd
import scipy.sparse
import sqlite3
from tqdm import tqdm
from time import perf_counter

import utils

## Apply names to topics
Note: the names were created manually in a CSV file, based on an interpretation of the 25 terms most similar to each topic. I also made a few very rough categorizations, the most important of which is 'Politics'.

In [2]:
# Load the hand-written topic labels (semicolon-separated CSV); the columns used below are DATM, Name and Category.
topics = pd.read_csv('data/atom_topic_names.csv', sep=';')

In [3]:
# The atom matrix orders its columns by topic id compared as text
# (0, 1, 10, 100, 101, ..., 11, ..., 2), so the topic table is put in that same order:
# cast DATM to str first, then sort on it.
topics = (
    topics
    .assign(DATM=topics['DATM'].astype(str))
    .sort_values('DATM')
)

In [4]:
# Preview the first rows to check the text ordering of DATM (0, 1, 10, 100, ...).
topics.head()

Unnamed: 0,DATM,Name,Category
233,0,Unlikable behavior,Human
121,1,"Kingdoms, kings and underworlds",Film
218,10,Superlatives,Grammar
216,100,Strong reactions,Other
22,101,Bodyparts,Grammar


In [5]:
# Topic names in the (text-sorted) row order of `topics`; used as column labels for the atom matrix.
names = topics.loc[:, 'Name'].to_list()

In [11]:
# Names of the topics whose Category is "Politics", selected in a single .loc lookup.
pol_names = topics.loc[topics["Category"] == "Politics", "Name"].tolist()

In [6]:
# Load the sparse matrix of topic ("atom") loadings and densify it into a DataFrame
# with one named column per topic. `names` must follow the matrix's column order,
# which is why `topics` is sorted as text before `names` is taken from it.
# toarray() returns a plain ndarray; todense() would return the legacy np.matrix type.
# NOTE(review): the dense frame holds every row x topic value in memory — confirm it fits.
atom_mat = scipy.sparse.load_npz('data/ngram_atoms.npz')
atoms = pd.DataFrame(atom_mat.toarray(), columns=names)

In [10]:
# Preview the dense topic-score frame; the columns carry the topic names.
atoms.head()

Unnamed: 0,Unlikable behavior,"Kingdoms, kings and underworlds",Superlatives,Strong reactions,Bodyparts,Female actors,Will,Remembering,Male actors 2,Doesn't,...,Berating,Have,Male names,Shit and assholes,Measurements,Extremely,Latin words,Names,Terrified and sociopathic,Occurrence in sequence
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.014851,0.0,0.009901,0.0,0.00495,0.00495,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.014286,0.0,0.009524,0.0,0.004762,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.046512,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Append the topic loadings to sql database

In [13]:
# Load the comments table (semicolon-separated CSV); its comment_id column is used below to key the topic scores.
comments = pd.read_csv('data/comments.csv', sep=';')

In [16]:
# Open a connection to the SQLite database that is written to and queried below
# (the later query reads its comments, submissions and topic_scores tables).
# NOTE(review): the connection is not closed in the visible cells — confirm it is closed elsewhere.
sql_db = './data/film_discussions'
conn = sqlite3.connect(sql_db)

In [17]:
# Comment ids as a plain list, in the row order of comments.csv.
comment_ids = comments["comment_id"].to_list()

In [18]:
# Attach a comment id to every row of topic scores so the SQL table can be joined on comment_id.
# Assumes row i of `atoms` corresponds to row i of comments.csv — TODO confirm.
# A length mismatch raises ValueError; a same-length misordering would go unnoticed.
atoms["comment_id"] = comment_ids

In [None]:
# Create the empty "topic_scores" table that the chunked insert below fills.
# Writing the whole frame with a single to_sql call raised an error here, although it
# did create the table with its column names. Only that side effect is needed, so a
# zero-row slice is written instead: it keeps every column name and dtype but inserts nothing.
# The default index=True is kept so the table layout matches what the full call created.
# NOTE(review): with the default if_exists='fail' this raises if the table already exists.
# The rows themselves are added in chunks of 10,000; see
# https://acepor.github.io/2017/08/03/using-chunksize/ for the choice of chunk size.
atoms.iloc[:0].to_sql("topic_scores", conn)

In [54]:
# Insert the topic scores into "topic_scores" in batches, one transaction per batch.
#
# Slices are positional and half-open (iloc), so each batch holds exactly `chunksize`
# rows and the last batch simply comes out shorter. The previous version used inclusive
# label slices with chunksize = 9999 to get 10,000 rows, plus bare `except:` fallbacks
# that could never trigger, because slicing past the end returns an empty frame
# instead of raising.
chunksize = 10_000  # rows per batch
n_columns = len(atoms.columns)

with tqdm(total=len(atoms)) as pbar:
    for start in range(0, len(atoms), chunksize):
        chunk = atoms.iloc[start:start + chunksize]
        # One plain tuple per row, in column order, as the parameter sets for the insert.
        data = [tuple(row) for row in chunk.to_numpy()]

        # `with conn` commits the batch on success and rolls it back on an exception.
        with conn:
            # Presumably returns the number of rows inserted — verify against utils.add_rows;
            # the progress bar relies on it.
            n_rows = utils.add_rows(conn, 'topic_scores', n_columns, data)

        pbar.update(n_rows)

100%|██████████████████████████████████████████████████████████████████████| 2118317/2118317 [06:39<00:00, 5297.46it/s]


## Collect data from the database

In [68]:
# Per submission: title, comment count, creation time and the mean
# "Extreme political identities" topic score over all of its comments.
sql = """
    SELECT submissions.title, submissions.num_comments, submissions.created, 
    AVG(topic_scores."Extreme political identities") AS "Pol_identity"
    FROM comments
    INNER JOIN topic_scores ON comments.comment_id = topic_scores.comment_id
    INNER JOIN submissions ON comments.submission_id = submissions.submission_id
    GROUP BY submissions.submission_id;
    """

# Run the query and report how long it took, in seconds.
query_start = perf_counter()
grouped = pd.read_sql(sql, conn)
elapsed = perf_counter() - query_start

print(elapsed)

5.61258270000053


In [70]:
# The ten submissions with the highest mean "Extreme political identities" score.
grouped.sort_values(by='Pol_identity', ascending=False).iloc[:10]

Unnamed: 0,title,num_comments,created,Pol_identity
984,Official Discussion - Moxie [SPOILERS],175,1614971128,0.024246
804,Official Discussion: Black Christmas [SPOILERS],423,1576206029,0.020419
76,Official Discussion: Dear White People [SPOILERS],207,1414205548,0.019298
882,Official Discussion - Black is King [SPOILERS],303,1596199386,0.016486
201,Official Discussion: Chi-Raq [SPOILERS],49,1449285175,0.016342
253,Official Discussion - Barbershop: The Next Cut...,138,1460769571,0.015132
1094,Official Discussion - There's Someone Inside Y...,103,1633658366,0.015061
267,Official Discussion - Neighbors 2: Sorority Ri...,207,1463712543,0.013848
806,Official Discussion: Richard Jewell [SPOILERS],866,1576267220,0.012879
686,Official Discussion: Little [SPOILERS],75,1555034424,0.012711
