In [4]:
from tqdm import tqdm
import os
import twint
import pandas as pd
import numpy as np
from collections import defaultdict
from collections import Counter
import csv
import math
from math import log2

# Model

In [5]:
# Load the precomputed arrays from .npy files in the working directory.
# NOTE(review): 'tagetArray.npy' looks like a misspelling of 'targetArray.npy' —
# confirm the on-disk file name before renaming anything.
# NOTE(review): bothMatrix is not referenced in the cells visible here.
matrix = np.load('matrix.npy')
bothMatrix = np.load('bothMatrix.npy')
targetArray = np.load('tagetArray.npy')
topTeamsListMessy = np.load('topTeamsList.npy')

def cleanTeamsList(teamsList):
    """Return a new list with each entry trimmed of its wrapper characters.

    Every element loses its first two and its last three characters
    (``entry[2:-3]``); the input sequence itself is left untouched.
    """
    return [rawName[2:-3] for rawName in teamsList]

# Trim the stray leading/trailing characters off every loaded team name.
topTeamsList = cleanTeamsList(topTeamsListMessy)


In [6]:
from sklearn.naive_bayes import BernoulliNB

def evaluate(target, prediction):
    """Tally a binary confusion matrix, treating label 1 as the positive class.

    Args:
        target: iterable of ground-truth labels (expected to be 0 or 1).
        prediction: iterable of predicted labels, aligned with ``target``.
            Pairs are formed with ``zip``, so extra items in the longer
            iterable are ignored.

    Returns:
        Tuple ``(TP, TN, FP, FN)`` of integer counts.
    """
    TP = TN = FP = FN = 0
    for ans, pred in zip(target, prediction):
        if ans < pred:
            # Predicted 1 while the truth is 0.
            FP += 1
        elif ans > pred:
            # Predicted 0 while the truth is 1.
            FN += 1
        elif ans == 1:
            # Correct positive. (Previously this case was counted as TN.)
            TP += 1
        elif ans == 0:
            # Correct negative. (Previously this case was counted as TP.)
            TN += 1
        # Matching pairs with any other label value are left uncounted.

    return TP, TN, FP, FN


def train(trainingMatrix, trainingTargetArray, testMatrix, testTargetArray):
    """Fit a Bernoulli naive Bayes model and score it on held-out rows.

    The classifier is trained on ``trainingMatrix`` / ``trainingTargetArray``,
    asked to predict ``testMatrix``, and those predictions are handed to
    ``evaluate`` together with ``testTargetArray``.

    Returns:
        Whatever ``evaluate`` returns for the test labels and predictions.
    """
    classifier = BernoulliNB()
    classifier.fit(trainingMatrix, trainingTargetArray)
    return evaluate(testTargetArray, classifier.predict(testMatrix))

#evaluate(targetArray, matrix)

In [7]:
# Hold out the first 25 and the last 25 rows for testing; train on the rows in between.
trainX, trainY = matrix[25:-25], targetArray[25:-25]
testX = np.concatenate([matrix[:25], matrix[-25:]], axis=0)
testY = np.concatenate([targetArray[:25], targetArray[-25:]], axis=0)

In [8]:
# Train on the middle rows and collect the confusion counts for the held-out rows.
TP, TN, FP, FN = train(trainX, trainY, testX, testY)

In [9]:
# Accuracy computed as one minus the error rate (misclassified / all counted predictions).
accuracy = 1 - ((FP+FN) / (TP+TN+FP+FN))
print(accuracy)

0.6799999999999999


# Entropies

In [10]:
def createSubgroups(matrix, targetArray, teamsList, teamSpec):
    """Partition the rows of ``matrix`` by the column belonging to ``teamSpec``.

    Args:
        matrix: sequence of rows; each row is indexable by column position.
        targetArray: per-row labels, indexed in step with ``matrix``.
        teamsList: column names; position ``i`` names column ``i`` of a row.
        teamSpec: the name in ``teamsList`` to split on.

    Returns:
        Tuple ``(listOfTeam, listOfNotTeam, teamTargetArray, notTeamTargetArray)``:
        rows whose ``teamSpec`` column equals 1, the remaining rows, and the
        matching label lists. Labels are normalised to 1 when
        ``int(label) == 1`` and to 0 otherwise.

    Raises:
        ValueError: if ``teamSpec`` does not occur in ``teamsList``.
    """
    teamIndex = None
    for index, team in enumerate(teamsList):
        if team == teamSpec:
            # Keep scanning: as before, the last matching position wins.
            teamIndex = index
    if teamIndex is None:
        # Without this guard the row loop failed with an unbound-variable error.
        raise ValueError("team %r not found in teamsList" % (teamSpec,))

    listOfTeam = []
    listOfNotTeam = []
    teamTargetArray = []
    notTeamTargetArray = []

    for rowIndex, person in enumerate(matrix):
        label = 1 if int(targetArray[rowIndex]) == 1 else 0
        if person[teamIndex] == 1:
            listOfTeam.append(person)
            teamTargetArray.append(label)
        else:
            listOfNotTeam.append(person)
            notTeamTargetArray.append(label)
    return listOfTeam, listOfNotTeam, teamTargetArray, notTeamTargetArray

# Split the full data set on the 'KamalaHarris' column and report the size of each side.
listOfTeam, listOfNotTeam, teamTargetArray, notTeamTargetArray = createSubgroups(matrix, targetArray, topTeamsList, 'KamalaHarris')

print("KAMALAHARRIS")
print("Length of team matrix: ", len(listOfTeam))
print("Length of NOT team matrix: ", len(listOfNotTeam))


KAMALAHARRIS
Length of team matrix:  130
Length of NOT team matrix:  312


In [11]:
def findTrumpBiden(group, targetArray):
    """Count the labels attached to the members of ``group``.

    One label is read from ``targetArray`` for each position in ``group``;
    a label equal to 1 is tallied as Trump, anything else as Biden.

    Returns:
        Tuple ``(countTrump, countBiden)``.
    """
    labels = [targetArray[position] for position, _ in enumerate(group)]
    trumpTotal = sum(1 for label in labels if label == 1)
    return trumpTotal, len(labels) - trumpTotal
# Count the labels inside the subgroup stored in listOfTeam / teamTargetArray.
# NOTE(review): the printed labels say "POTUS", but listOfTeam was built from the
# 'KamalaHarris' split — confirm the intended wording.
countTrump, countBiden = findTrumpBiden(listOfTeam, teamTargetArray)
print("POTUS Biden: ", countBiden)
print("POTUS Trump: ", countTrump)

POTUS Biden:  101
POTUS Trump:  29


In [12]:
def shannonEntropy(vector):
    """Return the base-2 Shannon entropy of a probability vector.

    Args:
        vector: iterable of probabilities.

    Returns:
        ``-sum(p * log2(p))`` over the entries with ``p > 0``. Entries that
        are zero or negative contribute nothing, and an empty vector yields 0
        (previously an empty vector raised because the result was only
        assigned inside the loop).
    """
    # Sum once after filtering, rather than re-summing on every iteration.
    return -1 * sum(p * log2(p) for p in vector if p > 0)

In [13]:
def findWeightedShannonEntropy(groupTeam, teamTargetArray, matrix):
    """Entropy of one subgroup's labels, scaled by the subgroup's share of ``matrix``.

    The Trump/Biden counts for ``groupTeam`` are turned into a two-element
    probability vector (both entries 0 when the group has no members), its
    Shannon entropy is taken, and the result is multiplied by
    ``len(groupTeam) / len(matrix)``.
    """
    trumpCount, bidenCount = findTrumpBiden(groupTeam, teamTargetArray)
    memberTotal = trumpCount + bidenCount
    if memberTotal == 0:
        distribution = [0, 0]
    else:
        distribution = [trumpCount / memberTotal, bidenCount / memberTotal]

    groupEntropy = shannonEntropy(distribution)
    return groupEntropy * (len(groupTeam) / len(matrix))

# Weighted entropy of each side of the split held in listOfTeam / listOfNotTeam,
# each weighted by its share of the full matrix.
# NOTE(review): the messages mention "POTUS User", but these groups come from the
# 'KamalaHarris' split — confirm the intended wording.
waeTeam = findWeightedShannonEntropy(listOfTeam, teamTargetArray, matrix)
waeNotTeam = findWeightedShannonEntropy(listOfNotTeam, notTeamTargetArray, matrix)
print("Weighted Average Entropy of team of POTUS User: ", waeTeam)
print("Weighted Average Entropy of NOT team of POTUS User:", waeNotTeam)

Weighted Average Entropy of team of POTUS User:  0.22521947302504128
Weighted Average Entropy of NOT team of POTUS User: 0.6666899774749303


In [14]:
def getSplitWAE(groupTeam, groupNotTeam, teamTargetArray, notTeamTargetArray, matrix):
    """Total weighted entropy of a two-way split of ``matrix``.

    Adds the weighted Shannon entropy of the in-team side to that of the
    not-in-team side, both weighted relative to ``matrix``.
    """
    branches = ((groupTeam, teamTargetArray), (groupNotTeam, notTeamTargetArray))
    insideWAE, outsideWAE = (
        findWeightedShannonEntropy(rows, labels, matrix) for rows, labels in branches
    )
    return insideWAE + outsideWAE

In [15]:
def createWAEDict(matrix, targetArray, topTeams):
    """Map every team in ``topTeams`` to the weighted entropy of splitting on it.

    Each team is used in turn to partition ``matrix`` with ``createSubgroups``,
    and the partition is scored with ``getSplitWAE``.

    Returns:
        A ``defaultdict(float)`` keyed by team name, in ``topTeams`` order.
    """
    waeByTeam = defaultdict(float)
    for candidate in topTeams:
        partition = createSubgroups(matrix, targetArray, topTeams, candidate)
        inTeam, outOfTeam, inLabels, outLabels = partition
        waeByTeam[candidate] = getSplitWAE(inTeam, outOfTeam, inLabels, outLabels, matrix)
    return waeByTeam

# Score every candidate team's split on the full data set.
newDict = createWAEDict(matrix, targetArray, topTeamsList)
print(newDict)

defaultdict(<class 'float'>, {'POTUS': 0.9847082431206808, 'BarackObama': 0.9469477457556994, 'VP': 0.9754961948047909, 'WhiteHouse': 0.9971988944303234, 'FLOTUS': 0.9912741149191607, 'KamalaHarris': 0.8919094504999716, 'DonaldJTrumpJr': 0.9726149077658671, 'IvankaTrump': 0.9824071497907477, 'CNN': 0.9914258421702105, 'cnnbrk': 0.9938625026624008, 'Mike_Pence': 0.9841214878617554, 'NASA': 0.9965301854861005, 'HillaryClinton': 0.986373901204091, 'PressSec': 0.9769976948390625, 'SidneyPowell1': 0.9546119031023518, 'elonmusk': 0.9949830565503922, 'nytimes': 0.9891788867669332, 'TheEllenShow': 0.9969636500691461, 'seanhannity': 0.9897370409809847, 'kayleighmcenany': 0.9718664627470197, 'jimmyfallon': 0.9990509447928101, 'washingtonpost': 0.9979337876652674, 'MichelleObama': 0.958975521722012, 'SenKamalaHarris': 0.9878352736471424, 'BBCBreaking': 0.989034902679274, 'RudyGiuliani': 0.9780513590165649, 'TrumpWarRoom': 0.9705844310763023, 'AOC': 0.972751128504824, 'KimKardashian': 0.9820060612

In [27]:
def findLowestSplit(dictOfWAE, matrix, topTeams, targetArray):
    """Pick the lowest-entropy team whose split leaves both sides non-empty.

    Args:
        dictOfWAE: mapping of team name to weighted entropy of its split.
        matrix: rows to be split.
        topTeams: column names, passed through to ``createSubgroups``.
        targetArray: per-row labels, passed through to ``createSubgroups``.

    Returns:
        The chosen team name as ``str``, or ``None`` when no team separates
        ``matrix`` into two non-empty groups.

    Side effects:
        Prints the (team, entropy) pairs sorted by ascending entropy.
    """
    sortedDict = sorted(dictOfWAE.items(), key=lambda x: x[1], reverse=False)
    print(sortedDict)

    # Walk the candidates from best to worst. The earlier recursive version
    # dropped the result of its recursive call, so it returned None as soon as
    # the top candidate produced an empty side.
    for teamName, _ in sortedDict:
        group1, group2, _, _ = createSubgroups(matrix, targetArray, topTeams, teamName)
        if len(group1) != 0 and len(group2) != 0:
            return str(teamName)
    return None
# Pick the best usable split for the full data set and show it.
team = findLowestSplit(newDict, matrix, topTeamsList, targetArray)
print("Lowest entropy: ")
print(team)

[('KamalaHarris', 0.8919094504999716), ('BarackObama', 0.9469477457556994), ('ArianaGrande', 0.9529453112140598), ('SidneyPowell1', 0.9546119031023518), ('MichelleObama', 0.958975521722012), ('rihanna', 0.9647668343235515), ('NICKIMINAJ', 0.9660898111710107), ('KylieJenner', 0.9695709730344851), ('TrumpWarRoom', 0.9705844310763023), ('kayleighmcenany', 0.9718664627470197), ('DonaldJTrumpJr', 0.9726149077658671), ('AOC', 0.972751128504824), ('LLinWood', 0.9728354972088535), ('ladygaga', 0.9735941794074161), ('EmmanuelMacron', 0.975267059532326), ('VP', 0.9754961948047909), ('PressSec', 0.9769976948390625), ('charlidamelio', 0.9773371153637272), ('FCBarcelona', 0.9773371153637272), ('RudyGiuliani', 0.9780513590165649), ('selenagomez', 0.9788356392732165), ('Cristiano', 0.9788454356939837), ('whoisaddison', 0.9808860765760709), ('kourtneykardash', 0.9808860765760709), ('Drake', 0.9812482269760348), ('khloekardashian', 0.9812482269760348), ('KimKardashian', 0.9820060612824858), ('IvankaTru

In [26]:
def createTree(matrix, topTeams, targetArray):
    """Greedily split the data, always descending into the larger side.

    While at least 5 rows remain, the lowest-entropy usable team is chosen,
    the split is reported via ``print``, and the process repeats on the
    larger of the two resulting groups (ties go to the in-team group). The
    smaller group is only reported, never split further.

    Returns:
        Always the empty string; the useful output is what gets printed.
    """
    while len(matrix) >= 5:
        waeDict = createWAEDict(matrix, targetArray, topTeams)
        user = findLowestSplit(waeDict, matrix, topTeams, targetArray)
        if user is None:
            # No team yields two non-empty groups; nothing left to split.
            break

        group1, group2, targetArray1, targetArray2 = createSubgroups(
            matrix, targetArray, topTeams, user
        )
        print()
        print("Split by: ", user)
        print("Weighted entropy: ", waeDict[user])

        teamSideIsLarger = len(group1) >= len(group2)
        largestGroup = group1 if teamSideIsLarger else group2
        largestTA = targetArray1 if teamSideIsLarger else targetArray2
        smallerGroup = group2 if teamSideIsLarger else group1

        print("Bigger group size: ", np.array(largestGroup).shape)
        print("Smaller group size: ", np.array(smallerGroup).shape)

        # Continue with the larger side only.
        matrix, targetArray = largestGroup, largestTA

    return ""
        

# Run the greedy splitting on the full data set; createTree prints each split as it goes.
createTree(matrix, topTeamsList, targetArray)

    
    


Split by:  KamalaHarris
Weighted entropy:  0.8919094504999716
Bigger group size:  (312, 84)
Smaller group size:  (130, 84)

Split by:  ArianaGrande
Weighted entropy:  0.8978935697462439
Bigger group size:  (288, 84)
Smaller group size:  (24, 84)

Split by:  MichelleObama
Weighted entropy:  0.8715800100828932
Bigger group size:  (275, 84)
Smaller group size:  (13, 84)

Split by:  DonaldJTrumpJr
Weighted entropy:  0.8461128866074614
Bigger group size:  (256, 84)
Smaller group size:  (19, 84)

Split by:  whoisaddison
Weighted entropy:  0.8633097917268996
Bigger group size:  (249, 84)
Smaller group size:  (7, 84)

Split by:  50cent
Weighted entropy:  0.8596426167495179
Bigger group size:  (245, 84)
Smaller group size:  (4, 84)

Split by:  SidneyPowell1
Weighted entropy:  0.8483394049965204
Bigger group size:  (233, 84)
Smaller group size:  (12, 84)

Split by:  EmmanuelMacron
Weighted entropy:  0.8699438061075507
Bigger group size:  (230, 84)
Smaller group size:  (3, 84)

Split by:  KylieJ

''