# Markov Model Design
This code reads in each one of the datasets created by our webscraper and concatenates them into one large data frame. The data frame is then normalized for each origin category to create a markov simulation

In [1]:
import time
import pandas as pd
import numpy as np
import scipy.linalg as la
import random
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.10f' % x)

In [2]:
#sets an ideology per channel
ideology = pd.read_csv("Hard Tagged Channels.csv")
ideology = ideology[ideology[["CHANNEL_TITLE","IDEOLOGY"]] != np.nan][["CHANNEL_TITLE","IDEOLOGY"]]
ideology['IDEOLOGY'].value_counts()

IDW                        128
Alt-light                   97
Social Justice              84
Conspiracy                  80
Partisan Right              52
Partisan Left               52
Alt-right                   44
Libertarian                 36
Socialist                   36
Anti-Theist                 29
MRA                         18
Religious Conservative      13
Revolutionary Socialist     10
Anti-white                   3
Name: IDEOLOGY, dtype: int64

In [3]:
#creates the Markov Probability matrix
test = pd.read_csv("combined counter.csv").drop(columns = "Unnamed: 0")

#sets categories for current channel
test = test.rename({"Category":"Current Ideology"}, axis = 1)

#sets categories into Next channel and removes others
test = test.rename({"IDEOLOGY":"Next Ideology"}, axis = 1)[["Current Ideology","Next Ideology","Count","Current Views"]].fillna("Others")
test = test.rename({"Count":"Amount"}, axis = 1)
test = test[test["Next Ideology"] != "Others"].reset_index().drop(columns = "index")

#creates the impiressions
test["Impressions"] = test["Current Views"]*test["Amount"]
test = test.drop(columns = ["Amount","Current Views"]).groupby(['Current Ideology','Next Ideology']).sum().reset_index()
test


Unnamed: 0,Current Ideology,Next Ideology,Impressions
0,Alt-light,Alt-light,63374705
1,Alt-light,Conspiracy,9915005
2,Alt-light,IDW,3931830
3,Alt-light,Libertarian,3321973
4,Alt-light,MRA,7064
...,...,...,...
101,Socialist,Partisan Left,1963377
102,Socialist,Partisan Right,55713
103,Socialist,Revolutionary Socialist,65896
104,Socialist,Social Justice,4711973


In [4]:
test = pd.merge(test,test.groupby("Current Ideology")['Impressions'].sum(), how = "left",left_on = "Current Ideology",right_on = "Current Ideology")
test["Probability"] = test["Impressions_x"]/test["Impressions_y"]
test = test.drop(columns = ["Impressions_x","Impressions_y"])
test

Unnamed: 0,Current Ideology,Next Ideology,Probability
0,Alt-light,Alt-light,0.6628679528
1,Alt-light,Conspiracy,0.1037060301
2,Alt-light,IDW,0.0411249899
3,Alt-light,Libertarian,0.0347461884
4,Alt-light,MRA,0.0000738859
...,...,...,...
101,Socialist,Partisan Left,0.0505827649
102,Socialist,Partisan Right,0.0014353421
103,Socialist,Revolutionary Socialist,0.0016976882
104,Socialist,Social Justice,0.1213952402


In [5]:
#Creates the Markov Matrix
#test = test.groupby(["Current Ideology","Next Ideology"]).sum().reset_index()
topics = test["Current Ideology"].unique()
markovmatrix = pd.DataFrame()
for topic in topics:
    temp = test[test["Current Ideology"] == topic].transpose().rename(columns = test["Next Ideology"], index = {"Probability":topic}).iloc[2:]
    markovmatrix = markovmatrix.append(temp)
markovmatrix = markovmatrix.fillna(0)
markovmatrix = markovmatrix.reindex(sorted(markovmatrix.columns),axis = 1)#.drop(index = "Others")
markovmatrix
#markovmatrix.to_csv("Markov Matrix without Others.csv")

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


Unnamed: 0,Alt-light,Alt-right,Anti-Theist,Anti-white,Conspiracy,IDW,Libertarian,MRA,Partisan Left,Partisan Right,Religious Conservative,Revolutionary Socialist,Social Justice,Socialist
Alt-light,0.6628679528,0.0,0.0,0.0,0.1037060301,0.0411249899,0.0347461884,7.38859e-05,0.0390889325,0.1098891552,0.0,0.0,0.0079567222,0.000546143
Alt-right,0.0012798221,0.2501936689,0.0,0.0,0.0023410038,0.1436086824,0.5350839812,0.0,0.0079805667,0.0074071344,0.0,0.0,0.0520269796,7.81609e-05
Anti-Theist,0.0,0.0,0.8759086586,0.0,0.0061586314,0.0094117244,0.0,0.0,0.0718145195,0.0028025127,0.0,0.0,0.0339039533,0.0
Anti-white,0.0005384772,0.0,0.0,0.3410279165,0.0,0.0,0.0,0.0,0.4863300289,0.0379745915,0.0,0.0,0.1341289859,0.0
Conspiracy,0.0,0.0,0.0,0.0025356753,0.6485428628,0.0115891363,0.0994848178,0.0,0.0375931379,0.1743382851,0.0,0.0,0.0251521189,0.000763966
IDW,0.0231807989,0.0,0.0081702689,0.0,0.0097457597,0.9379670184,0.0051567176,0.0,0.0064821166,0.0007958597,0.0,0.0,0.0066207374,0.0018807229
Libertarian,0.0,0.0,0.0,0.0,0.0,0.1309264473,0.811836877,0.0,0.0142271575,0.0217882735,0.0,0.0,0.0173757474,0.0038454973
MRA,0.0114460749,0.0,0.0,0.0,0.0843006403,0.1761954746,0.041284816,0.494835464,0.0239259826,0.0382537344,0.0,0.0,0.1297578131,0.0
Partisan Left,0.0,0.0,0.0,0.0,0.0034158181,0.0,9.51765e-05,0.0,0.8369089563,0.0277278462,0.0,0.0,0.1318507177,1.4852e-06
Partisan Right,0.0035792355,0.0,0.0,0.0,0.0010794844,6.72042e-05,0.1583148072,0.0,0.052478132,0.7801212532,0.0,0.0,0.0043598834,0.0


In [6]:
#Markov Steady State Simulation
#matrix = pd.read_csv("Markov Matrix without Others.csv").drop(columns = "Unnamed: 0")
vec = np.zeros((len(markovmatrix),1))
vec[0:len(markovmatrix)] = 1/len(markovmatrix)

for _ in range(100):
    vec = np.dot(np.transpose(markovmatrix),vec)

probdf = pd.DataFrame(vec, index = ['Alt-light', 'Alt-right', 'Anti-Theist', 'Anti-white', 'Conspiracy',
       'IDW', 'Libertarian', 'MRA', 'Partisan Left', 'Partisan Right',
       'Religious Conservative', 'Revolutionary Socialist', 'Social Justice',
       'Socialist'])
probdf = probdf.rename({0:"Steady State Probability"},axis = 1)
probdf
#probdf.to_csv("Steady State Probability without Others.csv")

Unnamed: 0,Steady State Probability
Alt-light,0.0121494791
Alt-right,0.0
Anti-Theist,0.0105913078
Anti-white,5.86406e-05
Conspiracy,0.0133317371
IDW,0.1590053198
Libertarian,0.0522580782
MRA,1.7808e-06
Partisan Left,0.129394042
Partisan Right,0.044289455


This model is scaled taking into accounts of video views, thus it shows what video types more people are exposed towards