In [25]:
import numpy as np
import pandas as pd

In [26]:
# Read the follower histories and parse the timestamp column in a single chain.
df = pd.read_csv('yayes_dfs.csv').assign(
    timestp=lambda raw: pd.to_datetime(raw['timestp'])
)

# Distinct artist identifiers found in the data.
artist_list = df['cm_artist'].unique()

In [120]:
%%time
# Benchmark of the DataFrame-based smoother: each artist's rows are extracted,
# given a 'followers_diff' column (change between adjacent entries) and then
# passed through yayes_og.
artists_patched = [
    yayes_og(
        df.loc[df['cm_artist'] == artist]
        .assign(followers_diff=lambda history: history['followers'].diff())
    )
    for artist in artist_list
]

# Stitch the per-artist results back into one frame.
above_histories_patched = pd.concat(artists_patched)

CPU times: user 21.3 s, sys: 32 ms, total: 21.3 s
Wall time: 21.3 s


In [121]:
%%time
# Benchmark of the array-based smoother: each artist's 'followers' column is
# replaced by the output of yayes; all other columns are carried over unchanged.
df_yayes = [
    df.loc[df['cm_artist'] == artist]
    .assign(followers=lambda history: yayes(history['followers']))
    for artist in artist_list
]

# Stitch the per-artist results back into one frame.
df2 = pd.concat(df_yayes)

CPU times: user 5.26 s, sys: 4 ms, total: 5.26 s
Wall time: 5.25 s


## TODO: Look at the case where the follower count begins with a decrease followed by an increase

In [119]:
def yayes(col, X = .15, last_day = False):
    '''
    Patch false dips in a follower-count series by linear interpolation.

    A dip starts where a non-negative change between adjacent entries is followed
    by a negative one.  The scan then walks across the floor of the dip (changes
    between 0 and X times the size of the drop) and, if the next change is a
    positive rebound, overwrites every value between the last pre-dip entry and
    the post-rebound entry with evenly spaced values.

    col: follower counts for one artist (pandas Series, numpy array, or any
         sequence numpy can convert to floats); assumed to be in chronological
         order.  The input is never modified.
    X: fraction of the initial drop; a non-negative change no larger than this
       still counts as part of the floor of the dip.
    last_day: accepted for signature compatibility; not used by this function.

    Returns a new float64 numpy array with the same length as col.
    '''

    # A Series is unwrapped to its underlying array; anything else goes through
    # np.asarray so plain lists work too.  astype() returns a fresh copy, which
    # keeps the caller's data untouched.  float64 is used instead of float32
    # because float32 cannot represent every integer above 2**24 and would
    # silently round large follower counts.
    values = col.values if isinstance(col, pd.Series) else np.asarray(col)
    col = values.astype('float64')
    differences = np.diff(col)
    max_i = len(differences)

    for i in range(len(col) - 2):

        drop = differences[i + 1]

        # Only a flat/rising step followed by a fall can start a dip.
        if not (differences[i] >= 0 and drop < 0):
            continue

        # Walk across the floor of the dip: small non-negative changes only.
        j = i + 2
        floor_limit = abs(drop * X)
        while j < max_i and 0 <= differences[j] <= floor_limit:
            j += 1

        # The walk stopped on either a further fall (leave the data alone), the
        # end of the series (no rebound seen), or a positive rebound.  A
        # positive value here is necessarily larger than floor_limit.
        if j < max_i and differences[j] > 0:
            fill_value = col[i + 1]
            increase = (col[j + 1] - fill_value) / (j - i)

            # Entries i+2 .. j lie on the straight line from col[i+1] to col[j+1].
            col[i + 2:j + 1] = fill_value + increase * np.arange(1, j - i)

            # Only the differences that touch the rewritten span have changed.
            differences[i + 1:j + 1] = np.diff(col[i + 1:j + 2])

    return col

In [5]:
def yayes_og(artist_df, last_day=False):
    
    '''
    This function is used to impute missing values and treat data errors.  Its primary purpose is to 
    correct false dips and spikes in an artist's follower count over time.
    
    artist_df: a pandas dataframe containing all of the data for one artist.  It must have a
               'followers' column and a 'followers_diff' column (change between adjacent entries);
               'timestp' is needed by lift_data when a dip is patched.  The frame is modified in place.
    last_day: when True, a final entry whose drop exceeds 99% of the previous follower count is
              reset to the previous count and its 'followers_diff' is set to 0.

    Returns the patched dataframe.
    '''
    
    #Get list of indices to iterate over
    indices = artist_df['followers_diff'].index.values
    n_rows = len(indices)
    
    # Row 0 is skipped: it has no predecessor, so its difference carries no information
    # (NaN when the column comes from .diff()).
    for i in range(1, n_rows - 1):
        
        #Re-read the column on every pass because lift_data replaces it after each patch
        followers_diff = artist_df['followers_diff']
        
        f = followers_diff.loc[indices[i]]
        s = followers_diff.loc[indices[i+1]]
        
        #A dip can only start where a flat/rising step is followed by a fall
        if not (f >= 0 and s < 0):
            continue
        
        #Walk across the floor of the dip: non-negative changes of at most 15% of the drop
        j = i + 2
        floor_limit = abs(.15 * s)
        
        while j < n_rows and 0 <= followers_diff.loc[indices[j]] <= floor_limit:
            j += 1
        
        #The walk ends on a further fall, the end of the data, or a rebound; only a rebound is patched.
        #NOTE(review): the previous version split rebounds into bands (within 5% of the drop, above it,
        #and between .5*s and 95% of it), but because .5*s is negative every rebound was lifted anyway.
        #That behaviour is kept here; confirm whether abs(.5*s) was intended as a lower bound.
        if j < n_rows and followers_diff.loc[indices[j]] > floor_limit:
            artist_df = lift_data(artist_df, i, j, indices)
    
    #The last-day check compares the final two rows, so it needs at least two of them
    if last_day and n_rows >= 2:
        last, previous = indices[n_rows - 1], indices[n_rows - 2]
        previous_followers = artist_df.loc[previous, 'followers']
        
        if -artist_df.loc[last, 'followers_diff'] > .99 * previous_followers:
            artist_df.loc[last, 'followers'] = previous_followers
            artist_df.loc[last, 'followers_diff'] = 0

    return artist_df
            
def lift_data(artist_df, i, j, indices):
    
    '''
    Overwrite the 'followers' values strictly between positions i and j with a straight line
    drawn from the value at i to the value at j, then recompute 'followers_diff'.

    The line is drawn against elapsed time ('timestp'), so rows that are not exactly one day
    apart still land on it.  If the two end points do not span a positive amount of time
    (identical or missing timestamps), the rows are spaced evenly by position instead.

    artist_df: dataframe for one artist with 'followers' and 'timestp' columns; modified in place.
    i, j: positions (into indices) of the last good row before the dip and the first good row after it.
    indices: array of index labels of artist_df, in row order.

    Returns artist_df.
    '''
    
    seconds_per_day = 86400
    interior = indices[i+1:j]
    
    if len(interior):
        fill_val = artist_df.loc[indices[i], 'followers']
        followers_delta = artist_df.loc[indices[j], 'followers'] - fill_val
        
        start_time = artist_df.loc[indices[i], 'timestp']
        span_days = (artist_df.loc[indices[j], 'timestp'] - start_time).total_seconds() / seconds_per_day
        
        if span_days > 0:
            #Slope per day, applied to each row's own elapsed days
            increase = followers_delta / span_days
            steps = [
                (artist_df.loc[label, 'timestp'] - start_time).total_seconds() / seconds_per_day
                for label in interior
            ]
        else:
            #No usable time span: avoid dividing by zero and space the rows evenly instead
            increase = followers_delta / (j - i)
            steps = range(1, j - i)
        
        #Interpolated values are generally fractional; make the column float up front rather than
        #relying on implicit upcasting when a float is written into an integer column
        artist_df['followers'] = artist_df['followers'].astype('float64')
        
        for label, step in zip(interior, steps):
            artist_df.loc[label, 'followers'] = fill_val + step * increase
    
    artist_df['followers_diff'] = artist_df['followers'].diff()
    
    return artist_df
    