# Markov Model Design
This code reads in each one of the datasets created by our webscraper and concatenates them into one large data frame. The data frame is then normalized for each origin category to create a markov simulation

In [1]:
import time
import pandas as pd
import numpy as np
import scipy.linalg as la
import random
np.set_printoptions(suppress=True)
pd.set_option('display.float_format', lambda x: '%.10f' % x)

In [2]:
#sets an ideology per channel
ideology = pd.read_csv("Hard Tagged Channels.csv")
ideology = ideology[ideology[["CHANNEL_TITLE","IDEOLOGY"]] != np.nan][["CHANNEL_TITLE","IDEOLOGY"]]
ideology['IDEOLOGY'].value_counts()

IDW                        128
Alt-light                   97
Social Justice              84
Conspiracy                  80
Partisan Left               52
Partisan Right              52
Alt-right                   44
Libertarian                 36
Socialist                   36
Anti-Theist                 29
MRA                         18
Religious Conservative      13
Revolutionary Socialist     10
Anti-white                   3
Name: IDEOLOGY, dtype: int64

In [3]:
#creates the Markov Probability matrix
test = pd.read_csv("combined tags.csv").drop(columns = "Unnamed: 0")

#sets categories for current channel
test = test.rename({"Category":"Current Tag"}, axis = 1)

#sets categories into Next channel and removes others
test = test.rename({"TAGS":"Next Tag"}, axis = 1)[["Current Tag","Next Tag","Count","Current Views"]].fillna("Others")
test = test.rename({"Count":"Amount"}, axis = 1)
test = test[test["Next Tag"] != "Others"].reset_index().drop(columns = "index")

#creates the impiressions
test["Impressions"] = test["Current Views"]*test["Amount"]
test = test.drop(columns = ["Amount","Current Views"]).groupby(['Current Tag','Next Tag']).sum().reset_index()
test


Unnamed: 0,Current Tag,Next Tag,Impressions
0,AIN,AIN,11146591.0000000000
1,AIN,AntiSJW,31902081.0000000000
2,AIN,AntiTheist,411347.0000000000
3,AIN,Conspiracy,23349214.0000000000
4,AIN,Educational,3570041.0000000000
...,...,...,...
363,WhiteIdentitarian,ReligiousConservative,915106.0000000000
364,WhiteIdentitarian,SocialJustice,812895.0000000000
365,WhiteIdentitarian,StateFunded,1949233.0000000000
366,WhiteIdentitarian,TV,1610542.0000000000


In [4]:
test = pd.merge(test,test.groupby("Current Tag")['Impressions'].sum(), how = "left",left_on = "Current Tag",right_on = "Current Tag")
test["Probability"] = test["Impressions_x"]/test["Impressions_y"]
test = test.drop(columns = ["Impressions_x","Impressions_y"])
test

Unnamed: 0,Current Tag,Next Tag,Probability
0,AIN,AIN,0.0863963971
1,AIN,AntiSJW,0.2472706550
2,AIN,AntiTheist,0.0031883200
3,AIN,Conspiracy,0.1809780195
4,AIN,Educational,0.0276711220
...,...,...,...
363,WhiteIdentitarian,ReligiousConservative,0.0317333971
364,WhiteIdentitarian,SocialJustice,0.0281889965
365,WhiteIdentitarian,StateFunded,0.0675941201
366,WhiteIdentitarian,TV,0.0558492337


In [11]:
#Creates the Markov Matrix
#test = test.groupby(["Current Ideology","Next Ideology"]).sum().reset_index()
topics = test["Current Tag"].unique()
markovmatrix = pd.DataFrame()
for topic in topics:
    temp = test[test["Current Tag"] == topic].transpose().rename(columns = test["Next Tag"], index = {"Probability":topic}).iloc[2:]
    markovmatrix = markovmatrix.append(temp)
markovmatrix = markovmatrix.fillna(0)
markovmatrix = markovmatrix.reindex(sorted(markovmatrix.columns),axis = 1)#.drop(index = "Others")
markovmatrix
#markovmatrix.to_csv("Markov Matrix.csv")

In [10]:
#Markov Steady State Simulation
#matrix = pd.read_csv("Markov Matrix without Others.csv").drop(columns = "Unnamed: 0")
vec = np.zeros((len(markovmatrix),1))
vec[0:len(markovmatrix)] = 1/len(markovmatrix)

for _ in range(100):
    vec = np.dot(np.transpose(markovmatrix),vec)

probdf = pd.DataFrame(vec, index = ['PartisanRight','Conspiracy','PartisanLeft','SocialJustice',
 'Libertarian','AntiSJW', 'MissingLinkMedia','WhiteIdentitarian','AIN',
 'ManoelAltRight','MRA','ReligiousConservative','ManoelIDW','Provocateur',
 'ManoelAltLite','Socialist','Mainstream News','TV', 'AntiTheist', 'StateFunded',
 'Educational', 'LateNightTalkShow', 'Revolutionary', 'AntiWhiteness',
 'Politician'])
probdf = probdf.rename({0:"Steady State Probability"},axis = 1)
probdf
#probdf.to_csv("Steady State Probability.csv")

Unnamed: 0,Steady State Probability
PartisanRight,0.0021027833
Conspiracy,0.0164516312
PartisanLeft,0.0043491423
SocialJustice,0.0013905609
Libertarian,0.0057825876
AntiSJW,0.0197931623
MissingLinkMedia,0.1745658628
WhiteIdentitarian,0.0158066955
AIN,2.51279e-05
ManoelAltRight,0.042488216


This model is scaled taking into accounts of video views, thus it shows what video types more people are exposed towards

In [7]:
uniquetags = ['PartisanRight','Conspiracy','PartisanLeft','SocialJustice',
 'Libertarian','AntiSJW', 'MissingLinkMedia','WhiteIdentitarian','AIN',
 'ManoelAltRight','MRA','ReligiousConservative','ManoelIDW','Provocateur',
 'ManoelAltLite','Socialist','Mainstream News','TV', 'AntiTheist', 'StateFunded',
 'Educational', 'LateNightTalkShow', 'Revolutionary', 'AntiWhiteness',
 'Politician']


temp = pd.read_csv("combined tags.csv").drop(columns = "Unnamed: 0")

for tag in uniquetags:
    if not (tag in temp["Category"].unique()):
        print(tag)

In [8]:
#temp[temp["Category"] == "Anti-white"]