In [560]:
import scipy.stats as scs
import pandas as pd
import src.clean as clean
%autoreload
import numpy as np
import re 
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.cluster import KMeans
from gensim.models.word2vec import Word2Vec
from nltk.stem.porter import PorterStemmer


# Getting my data

In [5]:
qualdf = pd.read_csv('soft_content.txt', sep="|")

In [6]:
pickle_in2 = open("quant_data.pkl","rb")
quant_df = pickle.load(pickle_in2)

In [609]:
combined = clean.combine_data(qualdf,quant_df)
combined2 = clean.clean_combined(combined)
combined2.dropna(axis =0, inplace=True)

In [610]:
vocab, hashtag_tfidf = clean.make_hashtag_tfidf(combined2)
combined2.drop(['hashtags'],axis=1, inplace=True)

In [611]:
for rep in (serious_reps + mid_reps+ lesser_reps):
    combined2[rep] = combined2.apply(lambda x: is_tagged(rep, x.people_tagged), axis=1)

# Adding caption sentiment vector to data

In [604]:
p_stemmer = PorterStemmer()

captions = list(combined2['caption'])
for thing in ['\n','\+','\+','\-','\/\/','\.','\#.+','\@.+']:
    captions = [re.sub(thing, ' ', caption) for caption in captions]
captions=[captions[i].split() for i in range(len(captions))]
captions=[[p_stemmer.stem(i) for i in word] for word in captions]


model = Word2Vec(captions,min_count=2)
vocab=list(model.wv.vocab.keys())

#making sentiment vector
vecs = []
for caption in captions:
    avg = np.zeros((100,))
    if len(caption)>0:
        for word in caption:
            if word in vocab:
                avg += model[word]
        avg /= len(caption)
    vecs.append(avg)
vecs = np.array(vecs)

#applying it to dataframe
for i in range(100):
    c_name = 'v-{}'.format(i)
    combined2[c_name] = (vecs[:,i])
    

combined2.drop(['caption'], axis=1, inplace=True)    



In [612]:
combined5 = pd.concat([combined2, hashtag_tfidf], axis=1)


In [613]:
combined5.drop(['people_tagged','caption'], axis=1, inplace = True)

In [614]:
yt = combined5['number_of_likes']
Xt = combined5.drop(['number_of_likes'], axis=1)


In [419]:
y = combined3['number_of_likes']
X = combined3.drop(['number_of_likes','people_tagged'], axis=1)


In [312]:
gbr = GradientBoostingRegressor(learning_rate=0.01, loss='ls',
                                min_samples_split=4, n_estimators=600,
                                max_depth=5)

In [420]:
scores = cross_val_score(gbr, X, y,cv=40, scoring= 'neg_mean_absolute_error')
print(scores.mean())
print(scores)

-46.05862157769279
[-47.59719965 -31.91220128 -59.93857042 -52.79252065 -58.87740695
 -68.86928371 -43.47176227 -62.0000536  -40.84084133 -32.98865437
 -54.39688262 -40.07191467 -38.59252294 -54.08261875 -67.64614904
 -39.90402162 -29.67173096 -30.91346797 -38.81827282 -30.60898776
 -33.253929   -30.92636486 -35.11192005 -33.41871624 -39.80398938
 -38.05325534 -46.99220831 -63.52879121 -55.96233592 -51.34886899
 -29.73829268 -43.12561171 -44.22743357 -57.45717087 -40.68882267
 -71.69639479 -73.73298127 -36.01040084 -60.32639578 -32.94591624]


In [547]:
scores = cross_val_score(gbr, Xt, yt,cv=40, scoring= 'neg_mean_absolute_error')
print(scores.mean())
print(scores)

-45.870712802938456
[-47.92614977 -31.53957189 -57.36399406 -52.80441282 -58.93310815
 -69.53039612 -44.01725658 -62.19672742 -40.90022451 -32.92311859
 -54.3268601  -38.82654959 -38.25799167 -53.86910004 -67.39091699
 -40.47056063 -28.8205207  -31.10729655 -38.002623   -30.95409393
 -33.41901201 -29.98771856 -34.77709141 -33.43249243 -40.27981662
 -38.29273263 -47.20043375 -63.54573243 -55.50962503 -51.20210596
 -28.89832739 -43.54328847 -44.24525769 -55.58148922 -40.72554516
 -71.96893576 -73.15294708 -35.9056808  -60.39662878 -32.60217784]


In [615]:
scores = cross_val_score(gbr, Xt, yt,cv=40, scoring= 'neg_mean_absolute_error')
print(scores.mean())
print(scores)

-46.044768960085534
[-48.02338663 -31.26848474 -58.98372277 -52.04706585 -57.90181546
 -69.44027141 -42.26590117 -62.09513902 -41.12290563 -33.12251515
 -53.85200224 -38.36440277 -38.54388678 -60.75085037 -69.10517438
 -39.75066423 -28.95043108 -31.61201852 -39.25736931 -31.58221179
 -34.0353694  -31.15436049 -34.93259906 -33.26652097 -39.60769088
 -37.65134646 -47.63167466 -62.9442924  -55.0925348  -50.7949
 -28.13952716 -42.96041957 -44.10165921 -55.61469638 -41.2268971
 -72.38781074 -73.45698218 -35.46124784 -59.69198801 -33.59802181]


In [620]:
def bootstrap(X, y ,n=25): 
    feat_imps=[]
    for i in range(n):
        b_ind = np.random.choice(range(len(X)), size=len(X), replace=True)
        y = y.iloc[b_ind]
        X = X.iloc[b_ind]
        gbr.fit(X,y)
        feat_imps.append(gbr.feature_importances_)
    return feat_imps    

In [None]:
feats = bootstrap(Xt,yt, n=100)

In [629]:
lower = np.percentile(feats, .05, axis=0)
upper = np.percentile(feats, .95, axis=0)

In [634]:
for name, low, up in zip(list(Xt.columns),lower,upper):
    print ("{}: ({} - {})".format(name, round(low,4),round(up,4)))

DOW_0: (0.0124 - 0.0124)
DOW_1: (0.0019 - 0.002)
DOW_2: (0.0004 - 0.0004)
DOW_3: (0.0004 - 0.0005)
DOW_4: (0.0001 - 0.0001)
DOW_5: (0.0005 - 0.0007)
DOW_6: (0.0008 - 0.0008)
hour: (0.0338 - 0.0342)
month: (0.107 - 0.1078)
num_people_tagged: (0.0081 - 0.0081)
num_people: (0.0018 - 0.0018)
faces_visible: (0.0017 - 0.0017)
sale: (0.0 - 0.0)
edited: (0.0023 - 0.0024)
butt_pic: (0.0 - 0.0001)
ocean: (0.0005 - 0.0005)
skate: (0.0002 - 0.0002)
drift_content: (0.0005 - 0.0005)
product_shot: (0.0058 - 0.0058)
lifestyle: (0.0005 - 0.0005)
in_shop: (0.0016 - 0.0016)
pro\model: (0.0013 - 0.0014)
in_nature: (0.0002 - 0.0002)
surf: (0.0006 - 0.0006)
male: (0.001 - 0.001)
female: (0.0019 - 0.0019)
bikini: (0.0004 - 0.0004)
apparel: (0.0005 - 0.0005)
thedriftcollective: (0.0009 - 0.0009)
elisabettalockhart: (0.0003 - 0.0003)
sea__soul: (0.0021 - 0.0021)
isabelletodd: (0.0002 - 0.0002)
beershower: (0.0002 - 0.0002)
idforreal: (0.0 - 0.0)
lturpan: (0.0001 - 0.0001)
bubblerock: (0.0001 - 0.0001)
filippae

# adding tagged people information

In [397]:
pickle_in = open("ppl_ever_tagged.pkl","rb")
ppl_ever_tagged = pickle.load(pickle_in)

In [398]:
serious_reps=[]
mid_reps = []
lesser_reps = []
for key, val in ppl_ever_tagged.items():
    if val >= 20:
        serious_reps.append(key)
    if val >= 4 and val < 7:
        lesser_reps.append(key)
    if val >= 7 and val <20:
        mid_reps.append(key)

In [409]:
combined2.columns

Index(['DOW_0', 'DOW_1', 'DOW_2', 'DOW_3', 'DOW_4', 'DOW_5', 'DOW_6',
       'caption', 'hour', 'month', 'num_people_tagged', 'number_of_likes',
       'people_tagged', 'num_people', 'faces_visible', 'sale', 'edited',
       'butt_pic', 'ocean', 'skate', 'drift_content', 'product_shot',
       'lifestyle', 'in_shop', 'pro\model', 'in_nature', 'surf', 'male',
       'female', 'bikini', 'apparel'],
      dtype='object')

In [410]:
def is_tagged(name, tags):
    if name in tags:
        return True
    else:
        return False

In [411]:
for rep in (serious_reps+ mid_reps):
    combined2[rep] = combined2.apply(lambda x: is_tagged(rep, x.people_tagged), axis=1)

In [538]:
for rep in (serious_reps+ mid_reps):
    combined2t[rep] = combined2t.apply(lambda x: is_tagged(rep, x.people_tagged), axis=1)

AttributeError: ("'Series' object has no attribute 'people_tagged'", 'occurred at index http://www.instagram.com/p/BeTdakGHCmh/?taken')