In [1]:
import glob
import re
import gensim
import copy
import numpy
import pandas as pd
import pickle
import random
from statistics import stdev
from gensim.models import Word2Vec
from sklearn.utils import resample
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

In [2]:
# Seed every RNG the notebook relies on. `random` drives the placebo word
# sample; sklearn's `resample` (called without random_state) draws from NumPy's
# global RNG, so seeding only `random` leaves the bootstrap resamples
# unrepeatable.
random.seed(91326)
numpy.random.seed(91326)

# Baseline model whose vocabulary is compared against the target word.
model = Word2Vec.load("intermediate/crime.model")
keys = model.wv.key_to_index
rwords = list(keys)

# Cosine similarity between "krime" and every vocabulary word. The target word
# itself is part of the vocabulary, which is why the maximum printed below is 1.0.
similarities = {word: model.wv.similarity("krime", word) for word in rwords}

# Summary statistics of the similarity distribution (mean, sd, max, min).
wordsmean = sum(similarities.values())/len(similarities)
std = stdev(similarities.values())
maxi = max(similarities.values())
mini = min(similarities.values())
print(wordsmean, std, maxi, mini)

0.07950838997114718 0.05673649706308635 1.0 -0.115716994


In [3]:
def chrono_train_crime(n_iterations, current_corpus, previous_model, output_model):
    """Bootstrap similarity scores for one era, starting from the previous era's model.

    Every run reloads the model stored at ``previous_model``, trains it for one
    epoch on a bootstrap resample of ``current_corpus``, and records the
    similarity of several fixed word pairs plus a "placebo" score (the mean
    similarity between "healthcare" and up to 100 randomly drawn vocabulary
    words). The model produced by the last run is saved to ``output_model`` so
    it can seed the next era.

    Parameters
    ----------
    n_iterations : int
        Number of bootstrap runs.
    current_corpus : sequence
        Corpus for the current era; it is resampled with replacement and passed
        to ``Word2Vec.train``.
    previous_model : str
        Path of the saved Word2Vec model used as the starting point.
    output_model : str
        Path the resulting model is saved to.

    Returns
    -------
    list
        Six lists, each with one value per run, in the order
        ``[racialization, hatecrime, latinx, asian, black, placebo]``.
    """
    racialization = []
    hatecrime = []
    latinx = []
    asian = []
    black = []
    placebo = []

    model = None

    for k in range(n_iterations):
        # Reload the starting point on every run. Binding the same model object
        # each time (`model = tempmodel`) would make the runs cumulative: run k
        # would already have been trained on the k-1 earlier resamples, so the
        # runs would not be independent bootstrap replicates.
        model = Word2Vec.load(previous_model)
        sentence_samples = resample(current_corpus)
        model.train(sentence_samples, total_examples=len(sentence_samples), epochs=1)

        racialization.append(model.wv.similarity('krime', 'racialization'))
        # NOTE(review): this is the only pair anchored on 'hatekrime' instead of
        # 'krime' -- confirm that is intended.
        hatecrime.append(model.wv.similarity('hatekrime', 'azn'))
        latinx.append(model.wv.similarity('krime', 'latinxdad'))
        asian.append(model.wv.similarity('krime', 'azn'))
        black.append(model.wv.similarity('krime', 'blck'))

        # Placebo: draw up to 100 vocabulary indices (index 0 is never drawn).
        # The sample size is clamped so small vocabularies do not make
        # random.sample raise; a set keeps the membership test O(1).
        keys = model.wv.key_to_index
        candidate_indices = range(1, len(keys))
        nums = set(random.sample(candidate_indices, min(100, len(candidate_indices))))
        rwords = [key for key, index in keys.items() if index in nums]
        similarities = {}
        for word in rwords:
            similarities[word] = model.wv.similarity("healthcare", word)

        if similarities:
            rwordsmean = sum(similarities.values())/len(similarities)
        else:
            # Nothing could be sampled (vocabulary of at most one word).
            rwordsmean = float("nan")
        placebo.append(rwordsmean)

        run = k+1
        print("Finished with run %d out of %d" % (run, n_iterations))

    if model is None:
        # No runs were requested: carry the previous model forward unchanged
        # instead of failing on an unbound name.
        model = Word2Vec.load(previous_model)
    model.save(output_model)

    return [racialization, hatecrime, latinx, asian, black, placebo]


In [4]:
# Era labels: one string per year from 2004 through 2022.
eras = [str(year) for year in range(2004, 2023)]

# Collects the value returned by chrono_train_crime for each era.
results_all = list()

# Pickled corpus; the training cell indexes it by era position (0 -> 2004).
with open('intermediate/crime_corpus.pickle', 'rb') as corpus_file:
    list_of_lists = pickle.load(corpus_file)

In [None]:
# Train the eras in chronological order. Each era starts from the model saved
# by the era before it; the first era starts from the baseline model.
n_bootstrap_runs = 100
model_dir = "intermediate/word2vec_models"

# Start from an empty list so re-running this cell does not append a second
# copy of every era's results.
results_all = []

previous_model_path = "intermediate/crime.model"
for era_number, era in enumerate(eras, start=1):
    # Same file naming as before, e.g. crime_model1_of_2004.model
    output_model_path = f"{model_dir}/crime_model{era_number}_of_{era}.model"
    era_results = chrono_train_crime(
        n_bootstrap_runs,
        list_of_lists[era_number - 1],
        previous_model_path,
        output_model_path,
    )
    results_all.append(era_results)
    # The model just saved seeds the next era.
    previous_model_path = output_model_path

Finished with run 1 out of 100
Finished with run 2 out of 100
Finished with run 3 out of 100
Finished with run 4 out of 100
Finished with run 5 out of 100
Finished with run 6 out of 100
Finished with run 7 out of 100
Finished with run 8 out of 100
Finished with run 9 out of 100
Finished with run 10 out of 100
Finished with run 11 out of 100
Finished with run 12 out of 100
Finished with run 13 out of 100
Finished with run 14 out of 100
Finished with run 15 out of 100
Finished with run 16 out of 100
Finished with run 17 out of 100
Finished with run 18 out of 100
Finished with run 19 out of 100
Finished with run 20 out of 100
Finished with run 21 out of 100
Finished with run 22 out of 100
Finished with run 23 out of 100
Finished with run 24 out of 100
Finished with run 25 out of 100
Finished with run 26 out of 100
Finished with run 27 out of 100
Finished with run 28 out of 100
Finished with run 29 out of 100
Finished with run 30 out of 100
Finished with run 31 out of 100
Finished with run

Finished with run 58 out of 100
Finished with run 59 out of 100
Finished with run 60 out of 100
Finished with run 61 out of 100
Finished with run 62 out of 100
Finished with run 63 out of 100
Finished with run 64 out of 100
Finished with run 65 out of 100
Finished with run 66 out of 100
Finished with run 67 out of 100
Finished with run 68 out of 100
Finished with run 69 out of 100
Finished with run 70 out of 100
Finished with run 71 out of 100
Finished with run 72 out of 100
Finished with run 73 out of 100
Finished with run 74 out of 100
Finished with run 75 out of 100
Finished with run 76 out of 100
Finished with run 77 out of 100
Finished with run 78 out of 100
Finished with run 79 out of 100
Finished with run 80 out of 100
Finished with run 81 out of 100
Finished with run 82 out of 100
Finished with run 83 out of 100
Finished with run 84 out of 100
Finished with run 85 out of 100
Finished with run 86 out of 100
Finished with run 87 out of 100
Finished with run 88 out of 100
Finished

Finished with run 15 out of 100
Finished with run 16 out of 100
Finished with run 17 out of 100
Finished with run 18 out of 100
Finished with run 19 out of 100
Finished with run 20 out of 100
Finished with run 21 out of 100
Finished with run 22 out of 100
Finished with run 23 out of 100
Finished with run 24 out of 100
Finished with run 25 out of 100


In [None]:
# Regroup the results by statistic instead of by era. results_all[e][s] holds
# statistic s for era e; stat_types[s][e] holds the same object. The six
# statistics are, in order: racialization, hatecrime, latinx, asian, black,
# placebo.
stat_types = [
    [era_stats[stat_index] for era_stats in results_all]
    for stat_index in range(6)
]

# Named views onto the same six lists.
(
    racialization_similarity,
    hatecrime_similarity,
    latinx_similarity,
    asian_similarity,
    black_similarity,
    placebo_similarity,
) = stat_types

## Saving model output

with open('aggregate_models/crime_chrono_model_output.pickle', 'wb') as output_file:
    pickle.dump(stat_types, output_file)