In [280]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import datetime
import csv

# CreationDate class: derives calendar and NBA-season features from a post's creation datetime
class CreationDate:
    """Wrap a post's creation ``datetime`` and derive calendar / NBA-season features.

    Attributes:
        dt: the ``datetime.datetime`` passed to the constructor.
        date: ``dt.date()``.
        time: ``dt.time()``.
    """

    # Season label -> (first day, first playoff day, first Finals day, last day).
    # All bounds are inclusive: the Finals checks are written as "day <= 8" /
    # "day <= 13", so the last Finals day must also count as part of the season.
    _SEASONS = {
        "2018": (
            datetime.date(2017, 10, 17),
            datetime.date(2018, 4, 14),
            datetime.date(2018, 5, 31),
            datetime.date(2018, 6, 8),
        ),
        "2019": (
            datetime.date(2018, 10, 16),
            datetime.date(2019, 4, 13),
            datetime.date(2019, 5, 30),
            datetime.date(2019, 6, 13),
        ),
    }

    # (start inclusive, end exclusive, label); any other time of day is "postgame".
    _DAY_SECTIONS = (
        (datetime.time(1, 0, 0), datetime.time(7, 0, 0), "deadzone"),
        (datetime.time(7, 0, 0), datetime.time(11, 0, 0), "morning"),
        (datetime.time(11, 0, 0), datetime.time(13, 0, 0), "lunch"),
        (datetime.time(13, 0, 0), datetime.time(17, 0, 0), "afternoon"),
        (datetime.time(17, 0, 0), datetime.time(21, 0, 0), "evening"),
        (datetime.time(21, 0, 0), datetime.time(23, 30, 0), "night"),
    )

    def __init__(self, dt):
        self.dt = dt
        self.date = dt.date()
        self.time = dt.time()

    def get_date(self):
        """Return the date part of the wrapped datetime."""
        return self.date

    def get_time(self):
        """Return the time-of-day part of the wrapped datetime."""
        return self.time

    def get_month(self):
        """Return the full month name (``strftime("%B")``), e.g. "May"."""
        return self.date.strftime("%B")

    def get_month_int(self):
        """Return the month as an integer, 1-12."""
        return self.date.month

    def get_year(self):
        """Return the calendar year."""
        return self.date.year

    def get_weekday(self):
        """Return the full weekday name (``strftime("%A")``), e.g. "Tuesday"."""
        return self.date.strftime("%A")

    def get_day_section(self):
        """Return the label of the day section containing ``self.time``.

        Times from 23:30 up to (but excluding) 01:00 fall outside every listed
        section and are labelled "postgame".
        """
        for start, end, label in self._DAY_SECTIONS:
            if start <= self.time < end:
                return label
        return "postgame"

    def _season(self):
        """Return ``(label, bounds)`` for the season containing ``self.date``, else ``("", None)``."""
        for label, bounds in self._SEASONS.items():
            if bounds[0] <= self.date <= bounds[3]:
                return label, bounds
        return "", None

    def season_yr(self):
        """Return "2018" or "2019" when the date lies in that season window, else ""."""
        return self._season()[0]

    def is_season(self):
        """Return True when the date lies inside one of the known season windows."""
        return self.season_yr() != ""

    def is_playoffs(self):
        """Return True from the season's first playoff day through its last Finals day."""
        _, bounds = self._season()
        # The season window already caps the date at the last Finals day.
        return bounds is not None and self.date >= bounds[1]

    def is_finals(self):
        """Return True from the season's first Finals day through its last Finals day."""
        _, bounds = self._season()
        return bounds is not None and self.date >= bounds[2]


# Functions to convert string into datetime object
def strip_timezone(datetime_str):
    """Return `datetime_str` without its last four characters (e.g. a trailing " EDT").

    The cut is purely positional; the removed text is not inspected.
    """
    suffix_length = 4
    return datetime_str[:-suffix_length]


def get_datetime(datetime_str_no_tz):
    """Parse a "YYYY-MM-DD HH:MM:SS" string into a ``datetime.datetime``.

    Raises:
        ValueError: if the string does not match that format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    parsed = datetime.datetime.strptime(datetime_str_no_tz, timestamp_format)
    return parsed


def date_string_to_datetime(datetime_str, has_timezone=True):
    """Convert a timestamp string into a ``datetime.datetime``.

    Args:
        datetime_str: text such as "2019-05-22 00:31:55 EDT", or the same
            without the trailing zone when `has_timezone` is False.
        has_timezone: when True, the last four characters are dropped via
            `strip_timezone` before parsing.

    Returns:
        The parsed ``datetime.datetime``. Previously the `has_timezone=False`
        path parsed the string but fell through without returning it.
    """
    if has_timezone:
        datetime_str = strip_timezone(datetime_str)
    return get_datetime(datetime_str)

from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import feature_selection
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [281]:
# Load the training and holdout sets; the follower-count column is shortened to 'Followers'.
df = pd.read_excel('training_set_extended.xls').rename(
    columns={'Followers at Posting': 'Followers'}
)
df.head()

holdout = pd.read_csv('holdout_set.csv', encoding='unicode_escape').rename(
    columns={'Followers at Posting': 'Followers'}
)
holdout.head()

Unnamed: 0,Engagements,Followers,Created,Type,Description
0,,36984682,2019-05-22 00:31:55 EDT,Photo,The @raptors even up the Eastern Conference Fi...
1,,36984682,2019-05-21 23:20:41 EDT,Photo,The @raptors even the Eastern Conference Final...
2,,36984682,2019-05-21 21:18:51 EDT,Video,@sergeibaka keeps it alive for the @raptors! (...
3,,36955156,2019-05-21 18:20:09 EDT,Video,???? The SUPERHUMAN HANDS of Giannis & Kawhi ?...
4,,36955156,2019-05-21 15:04:31 EDT,Video,?? the @raptors & @bucks in transition! #NBABr...


In [282]:
#### Function to clean word data: removes stopwords, lowercases, keeps numbers, and strips the '@' character (the handle text itself is kept)
def clean_words(sentences):
    """Normalise each text in `sentences` into a space-joined string of kept tokens.

    Each item is converted with ``str()``, has every '@' removed, is split on runs
    of non-word characters, lower-cased, and filtered against NLTK's English
    stopword list. Digits and underscores are word characters, so they survive.

    Args:
        sentences: a sized iterable of texts (e.g. a pandas Series).

    Returns:
        A numpy object array with one cleaned string per input item.
    """
    # Build the stopword set once; the original rebuilt the list for every word.
    stop_words = set(stopwords.words('english'))
    words_clean = np.full(len(sentences), None)
    for i, sentence in enumerate(sentences):
        text = str(sentence).replace('@', '')
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        tokens = (token.lower() for token in re.split(r'\W+', text))
        kept = [token for token in tokens if token not in stop_words]
        # Leading/trailing empty tokens from the split only add edge spaces.
        words_clean[i] = ' '.join(kept).strip()
    return words_clean

In [283]:
# Clean the post descriptions of both frames.
df['clean_words'] = clean_words(df['Description'])
holdout['clean_words'] = clean_words(holdout['Description'])
# Not the last expression of the cell, so these two calls display nothing.
df.head()
holdout.head()
# Parse every holdout 'Created' string (its last four characters are dropped
# first) into a CreationDate, then derive one column per accessor.
cds = [CreationDate(date_string_to_datetime(c)) for c in holdout['Created']]
holdout['datetime'] = [c.dt for c in cds]
holdout['year'] = [c.get_year() for c in cds]
holdout['month'] = [c.get_month() for c in cds]
holdout['month_int'] = [c.get_month_int() for c in cds]
holdout['weekday'] = [c.get_weekday() for c in cds]
holdout['day_time'] = [c.get_day_section() for c in cds]
holdout['season_yr'] = [c.season_yr() for c in cds]
holdout['is_season'] = [c.is_season() for c in cds]
holdout['is_playoffs'] = [c.is_playoffs() for c in cds]
holdout['is_finals'] = [c.is_finals() for c in cds]

# The training frame only receives the season-related columns here.
# NOTE(review): later cells read `month` and `day_time` from df as well;
# presumably those columns already come from training_set_extended.xls -- confirm.
Bds = [CreationDate(date_string_to_datetime(c)) for c in df['Created']]
df['season_yr'] = [c.season_yr() for c in Bds]
df['is_season'] = [c.is_season() for c in Bds]
df['is_playoffs'] = [c.is_playoffs() for c in Bds]
df['is_finals'] = [c.is_finals() for c in Bds]

In [284]:
# calculate MAPE for model predictions
def calculate_mape(true, predicted):
    """Return the mean absolute percentage error, in percent, of `predicted` vs `true`.

    The two sequences are paired with ``zip``, so extra items in the longer one
    are ignored. Each error is divided by its true value.
    """
    absolute_ratios = []
    for actual, estimate in zip(true, predicted):
        absolute_ratios.append(abs((actual - estimate) / actual))
    return 100 * np.mean(absolute_ratios)

In [285]:
# Load the team lookup sheet; 'Team Name' is renamed so it is reachable as an
# attribute on itertuples() rows.
nba_teams = pd.read_excel('Instagram Team ID.xlsx').rename(
    columns={'Team Name': 'Team_Name'}
)
nba_teams.head()

Unnamed: 0,Team_Name,Location,Name,Insta_ID,Number of Posts,Followers
0,Atlanta Hawks,Atlanta,Hawks,atlhawks,7336,1000000
1,Boston Celtics,Boston,Celtics,celtics,7822,4200000
2,Brooklyn Nets,Brooklyn,Nets,brooklynnets,6363,1200000
3,Charlotte Hornets,Charlotte,Hornets,hornets,9653,1000000
4,Chicago Bulls,Chicago,Bulls,chicagobulls,5202,4400000


In [286]:
# Keywords (full names, Instagram handles, nicknames) that are lower-cased and
# substring-matched against each post's `clean_words`.
# NOTE(review): clean_words splits on punctuation, so "Karl_Anthony Towns" and
# "D'Angelo Russell" can probably never match as written -- confirm the intended forms.
all_nba_players = ["Giannis Antetokounmpo", "giannis_an34","Giannis","Greek Freak",
           "James Harden","jharden13",
           "Paul George", "ygtrece",
           "Nikola Jokic","Nikola", "Jokic", "Joker",
           "Joel Embiid","joelembiid","Embiid",
           "Kevin Durant","easymoneysniper","Durant",
           "Damian Lillard","damianlillard","Dame",
           "Kawhi Leonard","Kawhi",
           "Russell Westbrook","russwest44","Westbrook","brodie",
           "Blake Griffin","blakegriffin23",
           "Rudy Gobert","rudygobert27","Gobert",
           "Kemba Walker","Kemba","_kw15",
           "Anthony Davis","antdavis23",
           "LaMarcus Aldridge","aldridge_121",
           "DeMar DeRozan","demar_derozan","DeRozan",
           "Jimmy Butler","jimmybutler",
           "Victor Oladipo","vicoladipo",
           "Karl_Anthony Towns","karltowns",
           "Ben Simmons","bensimmons",
           "Bradley Beal","bradbeal3",
           "Dwyane Wade","dwyanewade","DWade",
           "Klay Thompson","klaythompson","Klay",
           "D'Angelo Russell","dloading", "Dlo",
           "Dirk Nowitzki","swish41","Dirk",
           "Khris Middleton","k_mid22",
           "Kyle Lowry","kyle_lowry7",
           "Nikola Vucevic","NikolaVucevic",
           "Andre Drummond","andredrummondd",
           "Al Horford","alhorford",
           "Draymond Green","money23green",
           "Goran Dragic","the_1_dragon","Dragic",
           "John Wall","johnwall",
           "Kevin Love","kevinlove",
           "Kristaps Porzingis","kporzee"]
# The Kobe handle is stored without '@': clean_words removes every '@', so the
# "@kobebryant" spelling could never match.
superstars = ["Lebron James","kingjames","Lebron",
              "Stephen Curry","stephencurry30","Curry",
              "Kyrie Irving","kyrieirving","Kyrie",
             "Kobe Bryant", "kobebryant","black mamba"]
season_months = ["January","February","March","April","May","October","November","December"]
# Any of these substrings marks a post as league_type 2.
league_keywords = ("wnba", "nbagleague", "gleague")

# Lower-case each team's four lookup strings once instead of once per post.
# NOTE(review): matching is by substring, so a short name such as "nets" also
# matches inside "hornets" -- confirm whether whole-word matching is wanted.
team_keywords = [
    (
        [str(value).lower() for value in (team.Team_Name, team.Insta_ID, team.Location, team.Name)],
        team.Followers,
    )
    for team in nba_teams.itertuples()
]

followers_mentioned = []
all_nba = []
league_type = []
in_season = []
# Assumes df already has a `month` column holding full month names -- TODO confirm.
for row in df.itertuples():
    text = row.clean_words
    # league_type: 2 when a league keyword appears, else 1. The original
    # `("wnba" or "nbagleague" or "gleague") in text` only ever tested "wnba".
    league_type.append(2 if any(keyword in text for keyword in league_keywords) else 1)
    # in_season: 1 for the months listed in season_months, else 0.
    in_season.append(1 if row.month in season_months else 0)
    # all_nba: 1 = an All-NBA keyword matched, 2 = only a superstar keyword, 0 = neither.
    if any(name.lower() in text for name in all_nba_players):
        all_nba.append(1)
    elif any(name.lower() in text for name in superstars):
        all_nba.append(2)
    else:
        all_nba.append(0)
    # Sum the follower counts of every team with at least one keyword in the text.
    followers_mentioned.append(
        sum(
            followers
            for keywords, followers in team_keywords
            if any(keyword in text for keyword in keywords)
        )
    )
df["Followers_Mentioned"] = followers_mentioned
df["all_nba"] = all_nba
df["league_type"] = league_type
df["in_season"] = in_season
df.to_csv("training_set_extended_4.csv")

In [287]:
# Lower-case each team's four lookup strings once instead of once per post.
# NOTE(review): matching is by substring, so a short name such as "nets" also
# matches inside "hornets" -- confirm whether whole-word matching is wanted.
team_keywords = [
    (
        [str(value).lower() for value in (team.Team_Name, team.Insta_ID, team.Location, team.Name)],
        team.Followers,
    )
    for team in nba_teams.itertuples()
]

followers_mentioned = []
all_nba = []
league_type = []
in_season = []
for row in holdout.itertuples():
    text = row.clean_words
    # league_type: 2 when a league keyword appears, else 1. The original
    # `("wnba" or "nbagleague" or "gleague") in text` only ever tested "wnba".
    if any(keyword in text for keyword in ("wnba", "nbagleague", "gleague")):
        league_type.append(2)
    else:
        league_type.append(1)
    # in_season: 1 for the months listed in season_months, else 0.
    in_season.append(1 if row.month in season_months else 0)
    # all_nba: 1 = an All-NBA keyword matched, 2 = only a superstar keyword, 0 = neither.
    # '@' is stripped from each keyword because clean_words removes every '@'
    # from the text, so a keyword containing it could never match.
    if any(name.lower().replace('@', '') in text for name in all_nba_players):
        all_nba.append(1)
    elif any(name.lower().replace('@', '') in text for name in superstars):
        all_nba.append(2)
    else:
        all_nba.append(0)
    # Sum the follower counts of every team with at least one keyword in the text.
    followers_mentioned.append(
        sum(
            followers
            for keywords, followers in team_keywords
            if any(keyword in text for keyword in keywords)
        )
    )
holdout["Followers_Mentioned"] = followers_mentioned
holdout["all_nba"] = all_nba
holdout["league_type"] = league_type
holdout["in_season"] = in_season
holdout.to_csv("holdout_extended.csv")

In [288]:
df = pd.read_csv("training_set_extended_4.csv")
# same_day_post is 1 when at least one *other* post (different `Created` value)
# shares this post's first 14 `Created` characters and its `day_time`, else 0.
# A group-wise distinct count gives the same answer as comparing every pair of
# rows, without the quadratic nested loop.
# Assumes `Created` and `day_time` have no missing values -- TODO confirm.
# NOTE(review): Created[:14] is "YYYY-MM-DD HH:", i.e. the same hour rather than
# the same day; confirm whether [:10] was intended.
created_prefix = df["Created"].str[:14]
distinct_timestamps = df.groupby([created_prefix, "day_time"])["Created"].transform("nunique")
same_day_post = (distinct_timestamps > 1).astype(int).tolist()
df["same_day_post"] = same_day_post
df.to_csv("training_set_extended_5.csv")

In [289]:
holdout = pd.read_csv("holdout_extended.csv")
# same_day_post is 1 when at least one *other* post (different `Created` value)
# shares this post's first 14 `Created` characters and its `day_time`, else 0.
# A group-wise distinct count gives the same answer as comparing every pair of
# rows, without the quadratic nested loop.
# Assumes `Created` and `day_time` have no missing values -- TODO confirm.
# NOTE(review): Created[:14] is "YYYY-MM-DD HH:", i.e. the same hour rather than
# the same day; confirm whether [:10] was intended.
created_prefix = holdout["Created"].str[:14]
distinct_timestamps = holdout.groupby([created_prefix, "day_time"])["Created"].transform("nunique")
same_day_post = (distinct_timestamps > 1).astype(int).tolist()
holdout["same_day_post"] = same_day_post
holdout.to_csv("holdout_extended_2.csv")