In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import os
import glob
import datetime

from scipy.stats import kendalltau
from scipy.special import comb
import math
from Datasets import process 

In [2]:
def krcc_test(x, y):
    if len(x) == len(y):
        n = len(x)

        # default scipy is tau_b
        krcc = kendalltau(x, y)
        
        # https://github.com/mmhs013/pyMannKendall this also finds the tau and mann-kendall score but it doesnt do what we want
        
        print("KRCC =", krcc[0])

        # The two-sided p-value for a hypothesis test whose null hypothesis is an absence of association, tau = 0
        print("p value =", krcc[1])

    else:
        print("inconsistent number of points in x and y")

In [31]:
# Health Organizations
healthorg_df = pd.read_csv('./Processed Datasets/healthorg.csv')
healthorg_df = process.filter_dates(healthorg_df)


In [13]:
# News Outlets
leftnews_df = pd.read_csv('./Processed Datasets/leftnews.csv')
leftnews_df = process.filter_dates(leftnews_df)

rightnews_df = pd.read_csv('./Processed Datasets/rightnews.csv')
rightnews_df = process.filter_dates(rightnews_df)



In [10]:
# Individuals
leftind_df = pd.read_csv('./Processed Datasets/leftind.csv')
leftind_df = process.filter_dates(leftind_df)

rightind_df = pd.read_csv('./Processed Datasets/rightind.csv')
rightind_df = process.filter_dates(rightind_df)

celeb_df = pd.read_csv('./Processed Datasets/celebrities.csv')
celeb_df = process.filter_dates(celeb_df)

In [29]:
# fake / real
# still need to add covid_lies to fake data
fake_df = pd.read_csv('./Processed Datasets/fake.csv')
fake_df = process.filter_dates(fake_df)
real_df = pd.read_csv('./Processed Datasets/real.csv')
real_df = process.filter_dates(real_df)

In [14]:
# 1-Day Bin

healthorg_dates = healthorg_df.sort_values("date", ascending=True)["date"].value_counts()
rightind_dates = rightind_df.sort_values("date", ascending=True)["date"].value_counts()
leftind_dates = leftind_df.sort_values("date", ascending=True)["date"].value_counts()
rightnews_dates = rightnews_df.sort_values("date", ascending=True)["date"].value_counts()
leftnews_dates = leftnews_df.sort_values("date", ascending=True)["date"].value_counts()
celeb_dates = celeb_df.sort_values("date", ascending=True)["date"].value_counts()

fake_tweets = fake_df.sort_values("date", ascending=True)["date"].value_counts()
real_tweets = real_df.sort_values("date", ascending=True)["date"].value_counts()


In [18]:
# Bin
bins = '3D'

healthorg_df["date"] = pd.to_datetime(healthorg_df["date"])
rightind_df["date"] = pd.to_datetime(rightind_df["date"])
leftind_df["date"] = pd.to_datetime(leftind_df["date"])
rightnews_df["date"] = pd.to_datetime(rightnews_df["date"])
leftnews_df["date"] = pd.to_datetime(leftnews_df["date"])
celeb_df["date"] = pd.to_datetime(celeb_df["date"])
real_df["date"] = pd.to_datetime(real_df["date"])
fake_df["date"] = pd.to_datetime(fake_df["date"])

health_bins = pd.Series(index=healthorg_df.date, data=np.array(healthorg_df.count)).resample(bins).count().sort_index()
rightind_bins = pd.Series(index=rightind_df.date, data=np.array(rightind_df.count)).resample(bins).count().sort_index()
leftind_bins = pd.Series(index=leftind_df.date, data=np.array(leftind_df.count)).resample(bins).count().sort_index()
rightnews_bins = pd.Series(index=rightnews_df.date, data=np.array(rightnews_df.count)).resample(bins).count().sort_index()
leftnews_bins = pd.Series(index=leftnews_df.date, data=np.array(leftnews_df.count)).resample(bins).count().sort_index()
celeb_bins = pd.Series(index=celeb_df.date, data=np.array(celeb_df.count)).resample(bins).count().sort_index()
real_bins = pd.Series(index=real_df.date, data=np.array(real_df.count)).resample(bins).count().sort_index()
fake_bins = pd.Series(index=fake_df.date, data=np.array(fake_df.count)).resample(bins).count().sort_index()


In [17]:
print("HealthOrg Tweets: \t{}\nLeft Tweets: \t{}\nRight Tweets: \t{}\nCeleb Tweets: \t{}\nLeft News Outlet Tweets: \t{}\nRight News Outlet Tweets: \t{}\nFake Tweets:\t\t{}\nReal Tweets:\t\t{}\n".format(len(health_bins), len(leftind_bins), len(rightind_bins), len(celeb_bins), len(leftnews_bins), len(rightnews_bins), len(fake_bins), len(real_bins)))


HealthOrg Tweets: 	29
Left Tweets: 	29
Right Tweets: 	29
Celeb Tweets: 	29
Left News Outlet Tweets: 	29
Right News Outlet Tweets: 	29
Fake Tweets:		29
Real Tweets:		29



In [168]:
# only look at similar dates... so length is the same, need to find a better way

print("real and health")
krcc_test(real_bins, health_bins)
print("real and left")
krcc_test(real_bins, left_bins)
print("real and right")
krcc_test(real_bins, right_bins)
print("real and celeb")
krcc_test(real_bins[2:], celeb_bins)
print("real and left news")
krcc_test(real_bins, leftnews_bins)
print("real and right news")
krcc_test(real_bins, rightnews_bins)


real and health
KRCC = 0.03970235548035083
p value = 0.763810605835433
real and left
KRCC = 0.35323820252454613
p value = 0.00763720626819773
real and right
KRCC = 0.19775973020584628
p value = 0.13733232810844748
real and celeb
KRCC = -0.29670681572524854
p value = 0.05534087166996541
real and left news
KRCC = 0.25679090623776135
p value = 0.05099459850193937
real and right news
KRCC = 0.2398024081696029
p value = 0.06868418589158591


In [148]:
print("fake and health")
krcc_test(fake_bins, health_bins)
print("fake and left")
krcc_test(fake_bins, left_bins)
print("fake and right")
krcc_test(fake_bins, right_bins)
print("fake and celeb")
krcc_test(fake_bins[2:], celeb_bins)
print("fake and left news")
krcc_test(fake_bins, leftnews_bins)
print("fake and right news")
krcc_test(fake_bins, rightnews_bins)

fake and health
KRCC = -0.04975124378109452
p value = 0.707116174085537
fake and left
KRCC = 0.24039819146839242
p value = 0.06858686626191852
fake and right
KRCC = 0.3850048125902362
p value = 0.0037896496391532984
fake and celeb
inconsistent number of points in x and y
fake and left news
KRCC = 0.2623794527627235
p value = 0.046617897920194246
fake and right news
KRCC = 0.21561487276031072
p value = 0.10238783821511202
