In [47]:
from multiprocessing import Pool, cpu_count
import gc; gc.enable()
import pandas as pd
import numpy as np

# Base frames, loaded from train.csv and sample_submission_zero.csv.
train = pd.read_csv('raw_data/train.csv')
test = pd.read_csv('raw_data/sample_submission_zero.csv')

# Rows per msno in the cleaned transactions file -> `trans_count`.
transactions = pd.read_csv('cleaned_input/transactions.csv', usecols=['msno'])
transactions = transactions['msno'].value_counts().reset_index()
transactions.columns = ['msno', 'trans_count']
train = train.merge(transactions, how='left', on='msno')
test = test.merge(transactions, how='left', on='msno')
transactions = []  # rebind the name so the counts frame can be freed
print('transaction merge...')

# Rows per msno in the raw user logs -> `logs_count`.
user_logs = pd.read_csv('raw_data/user_logs.csv', usecols=['msno'])
user_logs = user_logs['msno'].value_counts().reset_index()
user_logs.columns = ['msno', 'logs_count']
train = train.merge(user_logs, how='left', on='msno')
test = test.merge(user_logs, how='left', on='msno')
user_logs = []
print('user logs merge...')

# Every column of the cleaned members file, left-joined on msno.
members = pd.read_csv('cleaned_input/members.csv')
train = train.merge(members, how='left', on='msno')
test = test.merge(members, how='left', on='msno')
members = []
print('members merge...')

In [29]:
# Reduce the raw transactions to one row per msno: sort newest
# transaction_date first, then keep the first row seen for each msno.
transactions = (
    pd.read_csv('raw_data/transactions.csv')
    .sort_values('transaction_date', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(subset=['msno'], keep='first')
)

# Left-join so every train/test row is kept even without a transaction.
train = train.merge(transactions, how='left', on='msno')
test = test.merge(transactions, how='left', on='msno')
transactions = []

In [30]:
# Preview the first five rows of the merged training frame.
train.head(n=5)

In [31]:
def transform_df(df):
    """Return one row per ``msno``: the row with the greatest ``date``.

    The input is first wrapped in ``pd.DataFrame``. Rows are ordered by
    ``date`` descending and renumbered from 0, then only the first row of
    each ``msno`` is kept, so the returned index holds positions within the
    sorted frame (with gaps where duplicates were removed).
    """
    return (
        pd.DataFrame(df)
        .sort_values('date', ascending=False)
        .reset_index(drop=True)
        .drop_duplicates(subset=['msno'], keep='first')
    )

def transform_df2(df):
    """Keep, for every ``msno``, only the row with the greatest ``date``.

    Same reduction as ``transform_df`` but without the ``pd.DataFrame``
    wrapping step: sort by ``date`` descending, renumber the rows from 0,
    then drop every repeated ``msno`` after its first occurrence.
    """
    newest_first = df.sort_values(by='date', ascending=False)
    renumbered = newest_first.reset_index(drop=True)
    return renumbered.drop_duplicates(subset=['msno'], keep='first')

# Stream user_logs.csv in large chunks and keep, per msno, only the row with
# the greatest `date`. Each retained chunk is split across worker processes
# (transform_df), reduced again after concatenation (transform_df2), and the
# per-chunk winners get one final reduction at the end.
CHUNK_ROWS = 10000000
# The first SKIP_CHUNKS chunks are read but ignored (the original condition
# was `i > 35`). ~400 Million Records - starting at the end but remove
# locally if needed.
SKIP_CHUNKS = 36
n_workers = cpu_count()

df_iter = pd.read_csv('raw_data/user_logs.csv', low_memory=False, iterator=True, chunksize=CHUNK_ROWS)
last_user_logs = []
# One pool for the whole scan instead of one per chunk; the context manager
# also releases the workers if a chunk fails half-way through.
with Pool(n_workers) as pool:
    for i, df in enumerate(df_iter):
        if i < SKIP_CHUNKS or len(df) == 0:
            continue
        print(df.shape)
        parts = pool.map(transform_df, np.array_split(df, n_workers))
        # The same msno can win in several parts, so reduce once more.
        df = transform_df2(pd.concat(parts, axis=0, ignore_index=True))
        last_user_logs.append(df)
        print('...', df.shape)

if not last_user_logs:
    # pd.concat([]) would raise a far less helpful ValueError here.
    raise ValueError(
        'no user_logs chunk was processed: the file has at most '
        '%d chunks of %d rows; lower SKIP_CHUNKS' % (SKIP_CHUNKS, CHUNK_ROWS)
    )

last_user_logs = transform_df2(pd.concat(last_user_logs, axis=0, ignore_index=True))

train = pd.merge(train, last_user_logs, how='left', on='msno')
test = pd.merge(test, last_user_logs, how='left', on='msno')
last_user_logs = []

In [32]:
# Report the shape of each merged frame, then write both to disk.
for merged in (train, test):
    print(merged.shape)

train.to_csv("cleaned_input/train_merged.csv")
test.to_csv("cleaned_input/test_merged.csv")

In [33]:
# Record how many leading rows come from train, then stack train on top of
# test so both can be cleaned together and split again by position.
train_cutoff = train.shape[0]
data = pd.concat([train, test])




In [34]:
# Remove rows that hold fewer than 3 non-missing values. This is exactly what
# `dropna(thresh=3)` does: `thresh` counts required NON-missing cells, not
# tolerated missing ones.
# NOTE(review): if the intent was "drop rows with more than 3 missing cells",
# the threshold should be `data.shape[1] - 3` instead - confirm.
row_mask = data.notnull().sum(axis=1).values >= 3

# `train_cutoff` is a row position, so it must shrink by the number of
# training rows removed; otherwise the later `data[:train_cutoff]` split
# would pull test rows into train.
train_cutoff = int(row_mask[:train_cutoff].sum())
data = data[row_mask]

In [35]:
# TODO: remove outliers (not implemented yet)

In [36]:
#fill in missing data points and encode variables
from helpers import *

# Fill specs passed to helpers.to_fill_na as {column: fill value}.
# NOTE(review): 'med' is presumably interpreted by the helper as "use the
# column median" - confirm against helpers.py.
nan_zero = {"logs_count": 0}
nan_dif = {"city": -1, "registered_via": -1, "payment_method_id": -1,
           "is_auto_renew": -1, "is_cancel": -1}
# "registration_init_date" used to be listed twice, and "Unnamed: 0" is gone
# because that column is dropped below before any filling happens.
nan_med = dict.fromkeys([
    "bd", "registration_init_date", "expiration_date_month", "expiration_date_date",
    "expiration_date_year", "registration_init_year", "registration_init_month",
    "date", "num_25", "num_50", "num_75", "num_985", "num_100", "num_unq",
    "total_secs", "trans_count", "transaction_date", "membership_expire_date",
    "plan_list_price", "actual_amount_paid", "payment_plan_days",
    "registration_init_time", "expiration_date",
], 'med')
nan_cat = {"gender": "missing"}

# Drop the "Unnamed: 0" COLUMN. The previous call used axis=0, which looks the
# names up among the row labels instead of the columns.
# errors='ignore' keeps the cell runnable when the column is absent.
# NOTE(review): "gender" was also in the drop list, but it is filled by
# nan_cat right below and binarised/encoded further down the notebook, so it
# is kept here - confirm that is the intent.
data = data.drop(["Unnamed: 0"], axis=1, errors='ignore')
data = to_fill_na(data, nan_zero)
data = to_fill_na(data, nan_dif)
data = to_fill_na(data, nan_med)
data = to_fill_na(data, nan_cat)
# Show the frame that was just filled (`train` is untouched by this cell).
data.head()


In [40]:
# Per column: does any missing value remain?
data.isna().any()

In [38]:
#feature engineering

# Mean Gregorian month length in days (365.2425 / 12), the factor numpy uses
# for np.timedelta64(1, 'M'). Month features are derived from day counts with
# this constant because 'M' is an ambiguous timedelta unit that recent pandas
# versions refuse to divide by.
DAYS_PER_MONTH = 30.436875

#--- pricing ---
data['discount'] = data['plan_list_price'] - data['actual_amount_paid']
data['is_discount'] = (data['discount'] > 0).astype(int)
# payment_plan_days == 0 yields +/-inf (or NaN for 0/0); treat those as
# missing and impute them the same way as every other numeric column.
data['amt_per_day'] = (
    data['actual_amount_paid'] / data['payment_plan_days']
).replace([np.inf, -np.inf], np.nan)
data = to_fill_na(data, {'amt_per_day': 'med'})

#--- parse YYYYMMDD columns ---
# errors='coerce' turns unparseable values into NaT instead of aborting the
# cell; the resulting NaN durations are imputed below.
date_cols = ['transaction_date', 'membership_expire_date',
             'registration_init_time', 'expiration_date']
for col in date_cols:
    data[col] = pd.to_datetime(data[col], format='%Y%m%d', errors='coerce')

#--- membership duration: difference in days, then months ---
membership_span = data['membership_expire_date'] - data['transaction_date']
data['membership_duration'] = membership_span / np.timedelta64(1, 'D')
data = to_fill_na(data, {'membership_duration': 'med'})
# Months come from the imputed float day count (before the int cast), so this
# column can no longer hit astype(int) with NaN in it.
data['membership_duration_M'] = (data['membership_duration'] / DAYS_PER_MONTH).round().astype(int)
data['membership_duration'] = data['membership_duration'].astype(int)

#--- registration duration: difference in days, then months ---
registration_span = data['expiration_date'] - data['registration_init_time']
data['registration_duration'] = registration_span / np.timedelta64(1, 'D')
data = to_fill_na(data, {'registration_duration': 'med'})
data['registration_duration_M'] = (data['registration_duration'] / DAYS_PER_MONTH).round().astype(int)
data['registration_duration'] = data['registration_duration'].astype(int)

#--- combinations ---
data['reg_mem_duration'] = data['registration_duration'] - data['membership_duration']
# The result is assigned like every other to_fill_na call in this notebook;
# it used to be discarded.
data = to_fill_na(data, {'reg_mem_duration': 'med'})
data['reg_mem_duration_M'] = data['registration_duration_M'] - data['membership_duration_M']

# 1 only when the user is NOT on auto-renew AND has cancelled. The previous
# `==` between the two masks also flagged rows where both were False.
data['notAutorenew_&_cancel'] = ((data.is_auto_renew == 0) & (data.is_cancel == 1)).astype(np.int8)

# Whole registration years (truncated) strictly greater than 1.
data['long_time_user'] = (((data['registration_duration'] / 365).astype(int)) > 1).astype(int)


#feature ideas to implement
#- price paid / duration
#- list price / duration
#- number of unique songs / price paid
#- number of unique songs / list price
#- num_25, num_50, etc. / price paid and list price
#- discount percentage: (list price - price paid) / list price
#- convert date to day of the week
#- total secs / membership duration (in days and in months)
#- days to expiration (expiration date - date)
#- transaction year, month, day, time (make all separate variables)



In [42]:
from helpers import *
#change to dummy variable

# NOTE(review): label_binarizer / label_encoder come from helpers.py, which is
# not visible here. Whether they replace the listed columns or add new ones
# next to them should be confirmed there.
categorical_to_binarizer = ["gender"]
data = label_binarizer(data, categorical_to_binarizer)

categorical_to_encode = ["city", "gender", "registered_via", "payment_method_id", "is_auto_renew", "is_cancel"]
data = label_encoder(data, categorical_to_encode)

# Drop the listed COLUMNS. The previous call used axis=0, which looks the
# names up among the row labels instead of the columns.
# errors='ignore' tolerates columns a helper may already have removed.
variables_to_drop = categorical_to_encode + ["date", "registration_init_time", "expiration_date", "transaction_date", "membership_expire_date"]
data = data.drop(variables_to_drop, axis=1, errors='ignore')

In [46]:
# Show the dtype of every column currently in `data`.
data.dtypes

In [39]:
# Undo the earlier stacking by position: the first `train_cutoff` rows are
# the training rows, everything after them is the test set.
train, test = data.iloc[:train_cutoff], data.iloc[train_cutoff:]
for part in (train, test):
    print(part.shape)

train.to_csv("cleaned_input/train_consolidated.csv")
test.to_csv("cleaned_input/test_consolidated.csv")