In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

pd.options.mode.chained_assignment = None
data = pd.read_csv("ks-projects-201801_1.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
total_data = pd.concat([data])
total_data = total_data.iloc[:, 1:15]
total_data = total_data.rename(columns = {'usd pledged' : 'usd_pledged'})

# 행에 하나라도 na값이 있으면 행 삭제
total_data = total_data.dropna()

#필요없는 (중복되는) 열 삭제
total_data = total_data.drop('goal', 1)
total_data = total_data.drop('pledged', 1)
total_data = total_data.drop('usd_pledged', 1)

#goal에 대한 변수: usd_goal_real
#pledged에 대한 변수: usd_pledged_real
total_data = total_data.drop_duplicates(['name'])

# state열에 canceled, failed, live, successful, suspended가 있는 행만 추려내는 방식으로 고르기
state = ['canceled', 'failed', 'live', 'successful', 'suspended']
total_data = total_data.loc[total_data['state'].isin(state)]

# 뒤의 시간은 없애고 날짜 형식으로 변환
#total_data['deadline'] = pd.to_datetime(total_data['deadline'], format = "%Y/%m/%d").dt.date
total_data['deadline'] = pd.to_datetime(total_data['deadline'], format='%Y-%m-%d %H:%M:%S')

#total_data['launched'] = pd.to_datetime(total_data['launched'], format = "%Y/%m/%d").dt.date
total_data['launched'] = pd.to_datetime(total_data['launched'], format='%Y-%m-%d %H:%M:%S')

# deadline - launched 를 계산하여 지속 기간 계산
total_data['duration_days'] = (total_data['deadline'] - total_data['launched']).dt.days  

data_copy = total_data.copy() # 데이터 원본 복제

# eda를 위한 데이터프레임 생성
df = data_copy[~(data_copy.duration_days > 120)] # eda를 위한 데이터프레임 생성

usd_goal = df.usd_goal_real
usd_goal = pd.Series(usd_goal)
usd_goal = pd.to_numeric(usd_goal)
usd_goal = pd.DataFrame(usd_goal).astype(int)
df.usd_goal_real = usd_goal# goal을 완전한 숫자형으로 변환후 치환

backers = df.backers
backers = pd.to_numeric(backers)
backers = pd.DataFrame(backers).astype(int)
df.backers = backers

usd_pledged_real = df.usd_pledged_real
usd_pledged_real = pd.to_numeric(usd_pledged_real)
usd_pledged_real = pd.DataFrame(usd_pledged_real).astype(int)
df.usd_pledged_real = usd_pledged_real

#feature 값들을 log scale로 변환
df['log_backers'] = np.log1p(df.backers)
df['log_usd_goal'] = np.log1p(df.usd_goal_real)
df['log_usd_pledged'] = np.log1p(df.usd_pledged_real)

#state는 successful, failed, canceled만 보겠다.
#df_projects = df[(df['state'] == 'failed') | (df['state'] == 'successful') | (df['state'] == 'canceled')]
df_projects = df[(df['state'] == 'failed') | (df['state'] == 'successful')]

#각 state를 숫자로 변환
df_projects["state"] = df_projects["state"].astype('category')
df_projects["state"] = df_projects["state"].cat.codes
#canceled = 0, failed = 1, successful = 2

## creating and refining our features

In [4]:
#goal이 0인 project의 data는 삭제한다
(df.usd_goal_real == 0).value_counts()

False    371234
True         37
Name: usd_goal_real, dtype: int64

In [5]:
(df_projects[df_projects['backers'] <= 1].state == 1).value_counts()
#후원자가 0 ~ 1명인데 성공한 프로젝트 개수가 333개임 
#-> 자신이 등록하고 자기가 후원해서 성공시킨 프로젝트일 수 있으므로 삭제

False    67305
True       335
Name: state, dtype: int64

In [6]:
(df_projects['duration_days'] == 0).value_counts()
#duration days가 0인 data 지운다

False    328347
True         92
Name: duration_days, dtype: int64

In [7]:
#위에서 이상하다고 판단한 data 지우기

df_projects = df_projects[(df_projects.usd_goal_real >= 1)]
#37개의 data 지운다

df_projects = df_projects[ ~( (df_projects['backers'] <= 1) & (df_projects['state'] == 1) ) ]
#333개의 project data 지운다

df_projects = df_projects[(df_projects.duration_days >= 1)]
#84개 지움

In [8]:
# name에 대한 변수들 생성한다
# 프로젝트 이름이 얼마나 긴지, 단어 수는 몇개인지도 성공에 영향을 미치지 않을까 하는 생각에서 변수 생성했다
df_projects['name_len'] = df_projects.name.str.len()
df_projects['name_words'] = df_projects.name.apply(lambda x: len(str(x).split(' ')))

In [9]:
# 각 프로젝트의 lanched 변수를 통해 몇 분기, 몇 월, 몇 년도에 시작한 프로젝트인지 알 수 있도록 변수생성
df_projects['launched_quarter']= df_projects['launched'].dt.quarter
df_projects['launched_month']= df_projects['launched'].dt.month
df_projects['launched_year']= df_projects['launched'].dt.year
df_projects['launched_week']= df_projects['launched'].dt.week

In [10]:
#goal, backers, pledged로 부터 추가적인 변수 생성

#나눌때 0으로 나누지 않도록 backers = 0인 것은 1로 바꿈(backers 1이하인데 성공인 것은 이미 지웠다.)
df_projects.loc[df_projects['backers'] == 0, 'backers'] = 1

#1명의 후원자 당 후원금액을 추가 변수로 생성
df_projects.loc[:,'pledge_per_backer'] = df_projects['usd_pledged_real'] / df_projects['backers']

#프로젝트에 대해 설정한 goal을 얼마나 달성했느냐 에 대한 변수 생성
df_projects.loc[:,'goal_achieve_rate'] = df_projects['usd_pledged_real'] / df_projects['usd_goal_real']

In [11]:
#will create percentile buckets for the goal amount in a category
#카테고리를 goal
df_projects['goal_cat_perc'] =  df_projects.groupby(['category'])['usd_goal_real'].transform(
                     lambda x: pd.qcut(x, [0, .35, .70, 1.0], labels =[1,2,3]))

#will create percentile buckets for the duration in a category
df_projects['duration_cat_perc'] =  df_projects.groupby(['category'])['duration_days'].transform(
                     lambda x: pd.qcut(x, [0, .35, .70, 1.0], labels =False, duplicates='drop'))

In [12]:
#creating a metric to see number of competitors for a given project in a given quarter
#number of participants in a given category, that launched in the same year and quarter and in the same goal bucket
ks_particpants_qtr=df_projects.groupby(['category','launched_year','launched_quarter','goal_cat_perc']).count()
ks_particpants_qtr=ks_particpants_qtr[['name']]
#since the above table has all group by columns created as index, converting them into columns
ks_particpants_qtr.reset_index(inplace=True)

#creating a metric to see number of competitors for a given project in a given month
#number of participants in a given category, that launched in the same year and month and in the same goal bucket
ks_particpants_mth=df_projects.groupby(['category','launched_year','launched_month','goal_cat_perc']).count()
ks_particpants_mth=ks_particpants_mth[['name']]
#since the above table has all group by columns created as index, converting them into columns
ks_particpants_mth.reset_index(inplace=True)

#creating a metric to see number of competitors for a given project in a given week
#number of participants in a given category, that launched in the same year and week and in the same goal bucket
ks_particpants_wk=df_projects.groupby(['category','launched_year','launched_week','goal_cat_perc']).count()
ks_particpants_wk=ks_particpants_wk[['name']]
#since the above table has all group by columns created as index, converting them into columns
ks_particpants_wk.reset_index(inplace=True)

In [13]:
#renaming columns of the derived table
colmns_qtr=['category', 'launched_year', 'launched_quarter', 'goal_cat_perc', 'participants_qtr']
ks_particpants_qtr.columns=colmns_qtr

colmns_mth=['category', 'launched_year', 'launched_month', 'goal_cat_perc', 'participants_mth']
ks_particpants_mth.columns=colmns_mth

colmns_wk=['category', 'launched_year', 'launched_week', 'goal_cat_perc', 'participants_wk']
ks_particpants_wk.columns=colmns_wk

In [14]:
#merging the particpants column into the base table
df_projects = pd.merge(df_projects, ks_particpants_qtr, on = ['category', 'launched_year', 'launched_quarter','goal_cat_perc'], how = 'left')
df_projects = pd.merge(df_projects, ks_particpants_mth, on = ['category', 'launched_year', 'launched_month','goal_cat_perc'], how = 'left')
df_projects = pd.merge(df_projects, ks_particpants_wk, on = ['category', 'launched_year', 'launched_week','goal_cat_perc'], how = 'left')

In [15]:
#creating 2 metrics to get average pledge per backer for a category in a year according to the goal bucket it lies in and the success rate ie average pledged to goal ratio for the category and goal bucket in this year
#using pledge_per_backer (computed earlier) and averaging it by category in a launch year
ks_ppb_goal=pd.DataFrame(df_projects.groupby(['category','launched_year','goal_cat_perc'])['pledge_per_backer','goal_achieve_rate'].mean())
#since the above table has all group by columns created as index, converting them into columns
ks_ppb_goal.reset_index(inplace=True)
#renaming column
ks_ppb_goal.columns= ['category','launched_year','goal_cat_perc','avg_ppb_goal','avg_success_rate_goal']

#creating a metric: the success rate ie average pledged to goal ratio for the category in this year
ks_ppb_duration=pd.DataFrame(df_projects.groupby(['category','launched_year','duration_cat_perc'])['goal_achieve_rate'].mean())
#since the above table has all group by columns created as index, converting them into columns
ks_ppb_duration.reset_index(inplace=True)
#renaming column
ks_ppb_duration.columns= ['category','launched_year','duration_cat_perc','avg_success_rate_duration']

In [16]:
#merging the particpants column into the base table
df_projects = pd.merge(df_projects, ks_ppb_goal, on = ['category', 'launched_year','goal_cat_perc'], how = 'left')
df_projects = pd.merge(df_projects, ks_ppb_duration, on = ['category', 'launched_year','duration_cat_perc'], how = 'left')

In [17]:
#creating 2 metrics: mean and median goal amount
median_goal_cat=pd.DataFrame(df_projects.groupby(['category','launched_year','duration_cat_perc'])['usd_goal_real'].median())
#since the above table has all group by columns created as index, converting them into columns
median_goal_cat.reset_index(inplace=True)
#renaming column
median_goal_cat.columns= ['category','launched_year','duration_cat_perc','median_goal_year']

mean_goal_cat=pd.DataFrame(df_projects.groupby(['category','launched_year','duration_cat_perc'])['usd_goal_real'].mean())
#since the above table has all group by columns created as index, converting them into columns
mean_goal_cat.reset_index(inplace=True)
#renaming column
mean_goal_cat.columns= ['category','launched_year','duration_cat_perc','mean_goal_year']

In [18]:
#merging the particpants column into the base table
df_projects = pd.merge(df_projects, median_goal_cat, on = ['category', 'launched_year','duration_cat_perc'], how = 'left')
df_projects = pd.merge(df_projects, mean_goal_cat, on = ['category', 'launched_year','duration_cat_perc'], how = 'left')

In [19]:
# replacing all 'N,0"' values in the country column with 'NZERO' to avoid discrepancies while one hot encoding
df_projects = df_projects.replace({'country': 'N,0"'}, {'country': 'NZERO'}, regex=True)

In [20]:
#selecting the needed fields only
#this will lead to the final features list

#creating a list of columns to be dropped
drop_columns= ['name','launched','deadline','backers','usd_pledged_real','pledge_per_backer','goal_achieve_rate', 'log_backers', 'log_usd_pledged']
#dropping columns above
df_projects.drop(drop_columns, axis=1, inplace=True)

In [21]:
#these functions will be used on the textual column entries to remove '&','-' or white spaces
def replace_ampersand(val):
    if isinstance(val, str):
        return(val.replace('&', 'and'))
    else:
        return(val)

def replace_hyphen(val):
    if isinstance(val, str):
        return(val.replace('-', '_'))
    else:
        return(val)    
    
def remove_extraspace(val):
        if isinstance(val, str):
            return(val.strip())
        else:
            return(val) 

def replace_space(val):
        if isinstance(val, str):
            return(val.replace(' ', '_'))
        else:
            return(val)   

In [22]:
#apply those functions to all cat columns
#this will remove special characters from the character columns.
#Since these fields will be one-hot encoded, the column names so derived should be compatible with the requied format
df_projects['category'] = df_projects['category'].apply(remove_extraspace)
df_projects['category'] = df_projects['category'].apply(replace_ampersand)
df_projects['category'] = df_projects['category'].apply(replace_hyphen)
df_projects['category'] = df_projects['category'].apply(replace_space)

df_projects['main_category'] = df_projects['main_category'].apply(remove_extraspace)
df_projects['main_category'] = df_projects['main_category'].apply(replace_ampersand)
df_projects['main_category'] = df_projects['main_category'].apply(replace_hyphen)
df_projects['main_category'] = df_projects['main_category'].apply(replace_space)

In [23]:
list(df_projects)

['category',
 'main_category',
 'currency',
 'state',
 'country',
 'usd_goal_real',
 'duration_days',
 'log_usd_goal',
 'name_len',
 'name_words',
 'launched_quarter',
 'launched_month',
 'launched_year',
 'launched_week',
 'goal_cat_perc',
 'duration_cat_perc',
 'participants_qtr',
 'participants_mth',
 'participants_wk',
 'avg_ppb_goal',
 'avg_success_rate_goal',
 'avg_success_rate_duration',
 'median_goal_year',
 'mean_goal_year']

In [24]:
#creating a backup copy of the dataset
df_projects_copied= df_projects.copy()

df_projects_copied[:5]

Unnamed: 0,category,main_category,currency,state,country,usd_goal_real,duration_days,log_usd_goal,name_len,name_words,...,goal_cat_perc,duration_cat_perc,participants_qtr,participants_mth,participants_wk,avg_ppb_goal,avg_success_rate_goal,avg_success_rate_duration,median_goal_year,mean_goal_year
0,Poetry,Publishing,GBP,0,GB,1533,58,7.335634,31,6,...,2,2,16.0,5.0,2.0,31.546311,0.470729,0.424653,2000.0,14978.409836
1,Narrative_Film,Film_and_Video,USD,0,US,30000,59,10.308986,45,8,...,3,2,9.0,3.0,2.0,144.423276,0.575384,0.571198,8517.5,76431.65
2,Narrative_Film,Film_and_Video,USD,0,US,45000,44,10.71444,14,3,...,3,2,78.0,28.0,3.0,159.083334,0.327343,0.533343,25000.0,364115.292135
3,Music,Music,USD,0,US,5000,29,8.517393,49,7,...,2,0,289.0,111.0,24.0,67.586157,0.796357,1.816775,3500.0,7125.419309
4,Restaurants,Food,USD,1,US,50000,34,10.819798,20,3,...,2,1,57.0,17.0,3.0,87.059456,0.225394,0.488309,20000.0,44310.723404


In [25]:
for c in df_projects.columns:
    col_type = df_projects[c].dtype
    if col_type == 'object' :
        a=df_projects[c].unique()
        keys= range(a.shape[0])
        diction={}
        for idx,val in enumerate(a):
            diction[idx] = a[idx]
        diction = {v: k for k, v in diction.items()}
        print(diction)
        df_projects_copied[c] = [diction[item] for item in df_projects_copied[c]] 
        df_projects_copied[c] = df_projects_copied[c].astype('category')

{'Poetry': 0, 'Narrative_Film': 1, 'Music': 2, 'Restaurants': 3, 'Food': 4, 'Drinks': 5, 'Nonfiction': 6, 'Indie_Rock': 7, 'Crafts': 8, 'Games': 9, 'Tabletop_Games': 10, 'Design': 11, 'Comic_Books': 12, 'Art_Books': 13, 'Fashion': 14, 'Childrenswear': 15, 'Theater': 16, 'Comics': 17, 'DIY': 18, 'Webseries': 19, 'Animation': 20, 'Food_Trucks': 21, 'Product_Design': 22, 'Public_Art': 23, 'Documentary': 24, 'Illustration': 25, 'Photography': 26, 'Pop': 27, 'People': 28, 'Art': 29, 'Family': 30, 'Fiction': 31, 'Film_and_Video': 32, 'Accessories': 33, 'Rock': 34, 'Hardware': 35, 'Software': 36, 'Weaving': 37, 'Web': 38, 'Jazz': 39, 'Ready_to_wear': 40, 'Festivals': 41, 'Video_Games': 42, 'Anthologies': 43, 'Publishing': 44, 'Shorts': 45, 'Gadgets': 46, 'Electronic_Music': 47, 'Radio_and_Podcasts': 48, 'Cookbooks': 49, 'Apparel': 50, 'Metal': 51, 'Comedy': 52, 'Hip_Hop': 53, 'Periodicals': 54, 'Dance': 55, 'Technology': 56, 'Painting': 57, 'World_Music': 58, 'Photobooks': 59, 'Drama': 60, 'A

In [26]:
# One-Hot encoding to convert categorical columns to numeric
print('start one-hot encoding')

df_projects_ip = pd.get_dummies(df_projects, prefix = [ 'category', 'main_category', 'currency','country'],
                             columns = [ 'category', 'main_category', 'currency','country'])
    
#this will have created 1-0 flag columns (like a sparse matrix)
print('ADS dummy columns made')

start one-hot encoding
ADS dummy columns made


In [27]:
#creating 2 arrays: features and response

#features will have all independent variables
features=list(df_projects_ip)
features.remove('state')
#response has the target variable
y_state= ['state']

In [28]:
#creating a backup copy of the input dataset
df_projects_ip_copy= df_projects_ip.copy()

In [29]:
df_projects_ip[:5]

Unnamed: 0,state,usd_goal_real,duration_days,log_usd_goal,name_len,name_words,launched_quarter,launched_month,launched_year,launched_week,...,country_IT,country_JP,country_LU,country_MX,country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0,1533,58,7.335634,31,6,3,8,2015,33,...,0,0,0,0,0,0,0,0,0,0
1,0,30000,59,10.308986,45,8,3,9,2017,35,...,0,0,0,0,0,0,0,0,0,1
2,0,45000,44,10.71444,14,3,1,1,2013,2,...,0,0,0,0,0,0,0,0,0,1
3,0,5000,29,8.517393,49,7,1,3,2012,11,...,0,0,0,0,0,0,0,0,0,1
4,1,50000,34,10.819798,20,3,1,2,2016,8,...,0,0,0,0,0,0,0,0,0,1


In [30]:
from sklearn import preprocessing

# 각 피쳐의 값을 normalize 한다. usd_goal_real의 경우 값이 10000단위로 존재하는데 다른 변수 중 10미만의 값을
#가진 변수가 많다. 분류 모델에 적용할 input data는 정규화된 data이어야 한다.
df_projects_ip_scaled_ftrs = pd.DataFrame(preprocessing.normalize(df_projects_ip[features]))
df_projects_ip_scaled_ftrs.columns=list(df_projects_ip[features])

In [31]:
df_projects_ip_scaled_ftrs[:3]

Unnamed: 0,usd_goal_real,duration_days,log_usd_goal,name_len,name_words,launched_quarter,launched_month,launched_year,launched_week,goal_cat_perc,...,country_IT,country_JP,country_LU,country_MX,country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0.100051,0.003785,0.000479,0.002023,0.000392,0.000196,0.000522,0.131508,0.002154,0.000131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.363311,0.000715,0.000125,0.000545,9.7e-05,3.6e-05,0.000109,0.024427,0.000424,3.6e-05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.2e-05
2,0.122368,0.00012,2.9e-05,3.8e-05,8e-06,3e-06,3e-06,0.005474,5e-06,8e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3e-06


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn import model_selection
import math

In [33]:
#creating test and train dependent and independent variables
#Split the data into test and train (30-70: random sampling)
#will be using the scaled dataset to split 
train_x, test_x, train_y, test_y = train_test_split(df_projects_ip_scaled_ftrs, df_projects_ip[y_state], test_size=0.3, random_state=0)

In [34]:
features_count = train_x.shape[1]

parameters_rf = {'n_estimators':[50], 'max_depth':[20], 'max_features': 
                     [math.floor(np.sqrt(features_count)), math.floor(features_count/3)]}

def random_forest_classifier(features, target):
    """
    To train the random forest classifier with features and target data
    :param features:
    :param target:
    :return: trained random forest classifier
    """
    clf = RandomForestClassifier(n_estimators=50,criterion='gini' ,max_depth=20, max_features=2)
    clf.fit(features, target)
    return clf

In [35]:
trained_model_RF= random_forest_classifier(train_x[features], train_y[y_state])

  


In [36]:
# Predict the on the train_data
test_x["Pred_state_RF"] = trained_model_RF.predict(test_x[features])

# Predict the on the train_data
train_x["Pred_state_RF"] = trained_model_RF.predict(train_x[features])

# Predict the on the train_data
df_projects_ip["Pred_state_RF"] = trained_model_RF.predict(df_projects_ip_scaled_ftrs)

In [37]:
# Train and Test Accuracy
print ("Train Accuracy :: ", accuracy_score(train_y[y_state], trained_model_RF.predict(train_x[features])))
print ("Test Accuracy  :: ", accuracy_score(test_y[y_state], trained_model_RF.predict(test_x[features])))
print ("Complete Accuracy  :: ", accuracy_score(df_projects_ip[y_state], trained_model_RF.predict(df_projects_ip_scaled_ftrs)))
print (" Confusion matrix of complete data is", confusion_matrix(df_projects_ip[y_state],df_projects_ip["Pred_state_RF"]))

Train Accuracy ::  0.6888011986271538
Test Accuracy  ::  0.6555351172268011
Complete Accuracy  ::  0.678821323495785
 Confusion matrix of complete data is [[178198  17468]
 [ 87877  44452]]


In [155]:
## Feature importances
ftr_imp_rf=zip(features,trained_model_RF.feature_importances_)
for values in ftr_imp_rf:
    print(values)

('usd_goal_real', 0.026451039219868747)
('duration_days', 0.03739312119896577)
('log_usd_goal', 0.03573033350742481)
('name_len', 0.05725710280631455)
('name_words', 0.04313366730288019)
('launched_quarter', 0.020699201923999353)
('launched_month', 0.02674123513501449)
('launched_year', 0.04133427765127915)
('launched_week', 0.018548889417573645)
('goal_cat_perc', 0.015296723869199504)
('duration_cat_perc', 0.009152935133932269)
('participants_qtr', 0.043135119460748814)
('participants_mth', 0.03670777847869208)
('participants_wk', 0.03888409874839494)
('avg_ppb_goal', 0.04218731510884563)
('avg_success_rate_goal', 0.07016489244922992)
('avg_success_rate_duration', 0.05176906464212117)
('median_goal_year', 0.047709107072692325)
('mean_goal_year', 0.026858088718907193)
('category_3D_Printing', 0.00014159378160141784)
('category_Academic', 0.00018661318777005157)
('category_Accessories', 0.00046209217935960653)
('category_Action', 0.00014606200949600817)
('category_Animals', 0.0001721468

In [156]:
feature_imp_RF=pd.DataFrame(list(zip(features,trained_model_RF.feature_importances_)))
column_names_RF= ['features','RF_imp']
feature_imp_RF.columns= column_names_RF

In [157]:
feature_imp_RF= feature_imp_RF.sort_values('RF_imp',ascending=False)
feature_imp_RF[:15]

Unnamed: 0,features,RF_imp
15,avg_success_rate_goal,0.070165
3,name_len,0.057257
16,avg_success_rate_duration,0.051769
17,median_goal_year,0.047709
11,participants_qtr,0.043135
4,name_words,0.043134
14,avg_ppb_goal,0.042187
7,launched_year,0.041334
13,participants_wk,0.038884
1,duration_days,0.037393


In [158]:
import lightgbm as lgb

In [159]:
#create LGBM classifier model
gbm_model = lgb.LGBMClassifier(
        boosting_type= "dart",
        n_estimators=1300,
        learning_rate=0.08,
        num_leaves=35,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=9,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01
)

# LGBM with one-hot encoded features
#fit the model on training data
gbm_model=gbm_model.fit(train_x[features], 
            train_y[y_state], 
              verbose=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [160]:
# Predict the on the train_data
test_x["Pred_state_LGB"] = gbm_model.predict(test_x[features])

# Predict the on the train_data
train_x["Pred_state_LGB"] = gbm_model.predict(train_x[features])

# Predict the on the train_data
df_projects_ip["Pred_state_LGB"] = gbm_model.predict(df_projects_ip_scaled_ftrs)

  if diff:
  if diff:
  if diff:


In [161]:
# Train and Test Accuracy
print ("Train Accuracy :: ", accuracy_score(train_y[y_state], gbm_model.predict(train_x[features])))
print ("Test Accuracy  :: ", accuracy_score(test_y[y_state], gbm_model.predict(test_x[features])))
print ("Complete Accuracy  :: ", accuracy_score(df_projects_ip[y_state], gbm_model.predict(df_projects_ip_scaled_ftrs)))
print (" Confusion matrix of complete data is", confusion_matrix(df_projects_ip[y_state],df_projects_ip["Pred_state_LGB"]))

  if diff:


Train Accuracy ::  0.721236432690465


  if diff:


Test Accuracy  ::  0.6933505421803067


  if diff:


Complete Accuracy  ::  0.71287062302779
 Confusion matrix of complete data is [[159214  36452]
 [ 57725  74604]]


In [162]:
# classification matrix
print('\nClassification metrics')
print(classification_report(y_true=test_y[y_state], y_pred=test_x["Pred_state_LGB"]))



Classification metrics
             precision    recall  f1-score   support

          0       0.72      0.80      0.76     58470
          1       0.65      0.54      0.59     39929

avg / total       0.69      0.69      0.69     98399



In [163]:
## Feature importances
ftr_imp_lgb=zip(features,gbm_model.feature_importances_)

for values in ftr_imp_lgb:
    print(values)

('usd_goal_real', 2368)
('duration_days', 4531)
('log_usd_goal', 1393)
('name_len', 1832)
('name_words', 2183)
('launched_quarter', 683)
('launched_month', 607)
('launched_year', 694)
('launched_week', 720)
('goal_cat_perc', 441)
('duration_cat_perc', 1386)
('participants_qtr', 1514)
('participants_mth', 992)
('participants_wk', 979)
('avg_ppb_goal', 2051)
('avg_success_rate_goal', 3254)
('avg_success_rate_duration', 2133)
('median_goal_year', 1369)
('mean_goal_year', 1255)
('category_3D_Printing', 0)
('category_Academic', 6)
('category_Accessories', 42)
('category_Action', 33)
('category_Animals', 1)
('category_Animation', 37)
('category_Anthologies', 42)
('category_Apparel', 167)
('category_Apps', 109)
('category_Architecture', 18)
('category_Art', 107)
('category_Art_Books', 28)
('category_Audio', 11)
('category_Bacon', 0)
('category_Blues', 0)
('category_Calendars', 23)
('category_Camera_Equipment', 14)
('category_Candles', 3)
('category_Ceramics', 0)
("category_Children's_Books", 

In [164]:
feature_imp_lgb=pd.DataFrame(list(zip(features,gbm_model.feature_importances_)))
column_names_lgb= ['features','LGB_imp']
feature_imp_lgb.columns= column_names_lgb

feature_imp_lgb= feature_imp_lgb.sort_values('LGB_imp',ascending=False)
feature_imp_lgb

Unnamed: 0,features,LGB_imp
1,duration_days,4531
15,avg_success_rate_goal,3254
0,usd_goal_real,2368
4,name_words,2183
16,avg_success_rate_duration,2133
14,avg_ppb_goal,2051
3,name_len,1832
11,participants_qtr,1514
2,log_usd_goal,1393
10,duration_cat_perc,1386


In [165]:
df_projects_copied.goal_cat_perc = df_projects_copied.goal_cat_perc.astype('float32')

In [None]:
#creating features and response list
features_2=list(df_projects_copied)
features_2.remove('state')
features_2_numerical = [e for e in features_2 if e not in ('category','main_category','country','currency')]
features_2_categorical = ['category','main_category','country','currency']
response = ['state']
# Assuming same lines from your example
cols_to_norm = features_2_numerical
df_projects_copied[cols_to_norm] = df_projects_copied[cols_to_norm].apply(lambda x: (x - x.min()) / (x.max() - x.min()))
#creating test and train dependent and independent variables
#Split the data into test and train (30-70: random sampling)
#will be using the scaled dataset to split 
train_ind_2, test_ind_2, train_dep_2, test_dep_2 = train_test_split(df_projects_copied[features_2],df_projects_copied[response], test_size=0.3, random_state=0)
#create LGBM classifier model
gbm_model_2 = lgb.LGBMClassifier(
        boosting_type= "dart",
        n_estimators=1500,
        learning_rate=0.05,
        num_leaves=38,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=9,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01
)

# LGBM with one-hot encoded features
#fit the model on training data
gbm_model_2=gbm_model_2.fit(train_ind_2[features_2], 
            train_dep_2[response], 
            feature_name=features_2,
            categorical_feature= features_2_categorical,                
              verbose=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
New categorical_feature is ['category', 'country', 'currency', 'main_category']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


In [None]:
# Predict the on the train_data
test_ind_2["Pred_state_LGB"] = gbm_model_2.predict(test_ind_2[features_2])

# Predict the on the train_data
train_ind_2["Pred_state_LGB"] = gbm_model_2.predict(train_ind_2[features_2])

# Predict the on the train_data
df_projects_copied["Pred_state_LGB"] = gbm_model_2.predict(df_projects_copied[features_2])

# Train and Test Accuracy
print ("Train Accuracy :: ", accuracy_score(train_dep_2[response], gbm_model_2.predict(train_ind_2[features_2])))
print ("Test Accuracy  :: ", accuracy_score(test_dep_2[response], gbm_model_2.predict(test_ind_2[features_2])))
print ("Complete Accuracy  :: ", accuracy_score(df_projects_copied[response], gbm_model_2.predict(df_projects_copied[features_2])))
print (" Confusion matrix of complete data is", confusion_matrix(df_projects_copied[response],df_projects_copied["Pred_state_LGB"]))

print('\nClassification metrics')
print(classification_report(y_true=test_dep_2[response], y_pred=gbm_model_2.predict(test_ind_2[features_2])))

In [None]:
## Feature importances
ftr_imp_lgb_2=zip(features_2,gbm_model_2.feature_importances_)

for values in ftr_imp_lgb_2:
    print(values)

In [None]:
feature_imp_lgb_2=pd.DataFrame(list(zip(features_2,gbm_model_2.feature_importances_)))
column_names_lgb_2= ['features','LGB_imp_2']
feature_imp_lgb_2.columns= column_names_lgb_2

feature_imp_lgb_2= feature_imp_lgb_2.sort_values('LGB_imp_2',ascending=False)
feature_imp_lgb_2

In [73]:
class LGBMClassifier_GainFE(lgb.LGBMClassifier):
    @property
    def feature_importances_(self):
        if self._n_features is None:
            raise LGBMNotFittedError('No feature_importances found. Need to call fit beforehand.')
        return self.booster_.feature_importance(importance_type='gain')
# defining parameters
lgb_gain = LGBMClassifier_GainFE(boosting_type= "dart",
        n_estimators=1500,
        learning_rate=0.05,
        num_leaves=38,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=9,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01)
#fitting the model
lgb_gain.fit(train_ind_2[features_2], 
            train_dep_2[response], 
            feature_name=features_2,
            categorical_feature= features_2_categorical,                
              verbose=0)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
New categorical_feature is ['category', 'country', 'currency', 'main_category']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


LGBMClassifier_GainFE(boosting_type='dart', class_weight=None,
           colsample_bytree=0.8, importance_type='split',
           learning_rate=0.05, max_depth=9, min_child_samples=20,
           min_child_weight=0.001, min_split_gain=0.01, n_estimators=1500,
           n_jobs=-1, num_leaves=38, objective=None, random_state=None,
           reg_alpha=0.1, reg_lambda=0.1, silent=True, subsample=0.9,
           subsample_for_bin=200000, subsample_freq=0)

In [74]:
# Predict the on the train_data
test_ind_2["Pred_state_LGB_Gain"] = lgb_gain.predict(test_ind_2[features_2])

# Predict the on the train_data
train_ind_2["Pred_state_LGB_Gain"] = lgb_gain.predict(train_ind_2[features_2])

# Predict the on the train_data
df_projects_copied["Pred_state_LGB_Gain"] = lgb_gain.predict(df_projects_copied[features_2])

  if diff:
  if diff:
  if diff:


In [75]:
# Train and Test Accuracy
print ("Train Accuracy :: ", accuracy_score(train_dep_2[response], lgb_gain.predict(train_ind_2[features_2])))
print ("Test Accuracy  :: ", accuracy_score(test_dep_2[response], lgb_gain.predict(test_ind_2[features_2])))
print ("Complete Accuracy  :: ", accuracy_score(df_projects_copied[response], lgb_gain.predict(df_projects_copied[features_2])))
print (" Confusion matrix of complete data is", confusion_matrix(df_projects_copied[response],df_projects_copied["Pred_state_LGB_Gain"]))
print (".")

  if diff:


Train Accuracy ::  0.7263406528396437


  if diff:


Test Accuracy  ::  0.7016970484348991


  if diff:


Complete Accuracy  ::  0.7189475415027465
 Confusion matrix of complete data is [[158201  37522]
 [ 54779  77910]]


In [76]:
## Feature importances
ftr_imp_lgb_gain=zip(features_2,lgb_gain.feature_importances_)

for values in ftr_imp_lgb_gain:
    print(values)
    
# creating a dataframe to get top 15 features
ftr_imp_lgb_gain=pd.DataFrame(list(zip(features_2,lgb_gain.feature_importances_)))
column_names_lgb_gain= ['features','LGB_gain_imp']
ftr_imp_lgb_gain.columns= column_names_lgb_gain

ftr_imp_lgb_gain= ftr_imp_lgb_gain.sort_values('LGB_gain_imp',ascending=False)
ftr_imp_lgb_gain[:15]

('category', 575210.0459327698)
('main_category', 40441.560438632965)
('currency', 7292.616619586945)
('country', 49592.98641586304)
('usd_goal_real', 209559.47360658646)
('duration_days', 213522.1879386902)
('log_usd_goal', 59851.56570100784)
('name_len', 63472.83627843857)
('name_words', 74056.2430858612)
('launched_quarter', 1835.7367906570435)
('launched_month', 4986.485179901123)
('launched_year', 70614.66909790039)
('launched_week', 22563.971601486206)
('goal_cat_perc', 9299.729823589325)
('duration_cat_perc', 4735.508813858032)
('participants_qtr', 31986.143317222595)
('participants_mth', 15155.075054168701)
('participants_wk', 8736.523712158203)
('avg_ppb_goal', 27775.439732074738)
('avg_success_rate_goal', 494554.5424847603)
('avg_success_rate_duration', 80465.54461145401)
('median_goal_year', 6433.021738529205)
('mean_goal_year', 17454.838505744934)


Unnamed: 0,features,LGB_gain_imp
0,category,575210.045933
19,avg_success_rate_goal,494554.542485
5,duration_days,213522.187939
4,usd_goal_real,209559.473607
20,avg_success_rate_duration,80465.544611
8,name_words,74056.243086
11,launched_year,70614.669098
7,name_len,63472.836278
6,log_usd_goal,59851.565701
3,country,49592.986416
