In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [196]:
# the function for calculating 'duration' and updating the dataframe
def df_tailoring(df):
    le = LabelEncoder()
    df['launched_date'] = df['launched_at'].map(lambda x: datetime.fromtimestamp(x))
    df['deadline_date'] = df['deadline'].map(lambda x: datetime.fromtimestamp(x))
    df['status_date'] = df['state_changed_at'].map(lambda x: datetime.fromtimestamp(x))
    df['duration'] = df['deadline_date'] - df['launched_date']
    df['duration_days'] = df['duration'].map(lambda x: x.days)
    
    # layback means the how long does the status change after the deadline
    df['layback'] = df['status_date'] - df['deadline_date']
    df['layback_minutes'] = df['layback'].map(lambda x: x.seconds/60)

    df['disable_dummy'] = le.fit_transform(df['disable_communication'])
    df['country_label'] = le.fit_transform(df['country'])

### Preprocessing train set

In [197]:
train_df = pd.read_csv('train.csv')
df_tailoring(train_df)
train_df.head()

Unnamed: 0,project_id,name,desc,goal,keywords,disable_communication,country,currency,deadline,state_changed_at,...,final_status,launched_date,deadline_date,status_date,duration,duration_days,layback,layback_minutes,disable_dummy,country_label
0,kkst1451568084,drawing for dollars,I like drawing pictures. and then i color them...,20.0,drawing-for-dollars,False,US,USD,1241333999,1241334017,...,1,2009-04-24 20:52:03,2009-05-03 07:59:59,2009-05-03 08:00:17,8 days 11:07:56,8,00:00:18,0.3,0,10
1,kkst1474482071,Sponsor Dereck Blackburn (Lostwars) Artist in ...,"I, Dereck Blackburn will be taking upon an inc...",300.0,sponsor-dereck-blackburn-lostwars-artist-in-re...,False,US,USD,1242429000,1242432018,...,0,2009-04-29 04:26:32,2009-05-16 00:10:00,2009-05-16 01:00:18,16 days 19:43:28,16,00:50:18,50.3,0,10
2,kkst183622197,Mr. Squiggles,So I saw darkpony's successfully funded drawin...,30.0,mr-squiggles,False,US,USD,1243027560,1243027818,...,0,2009-05-12 22:39:58,2009-05-22 22:26:00,2009-05-22 22:30:18,9 days 23:46:02,9,00:04:18,4.3,0,10
3,kkst597742710,Help me write my second novel.,Do your part to help out starving artists and ...,500.0,help-me-write-my-second-novel,False,US,USD,1243555740,1243556121,...,1,2009-04-29 01:58:50,2009-05-29 01:09:00,2009-05-29 01:15:21,29 days 23:10:10,29,00:06:21,6.35,0,10
4,kkst1913131122,Support casting my sculpture in bronze,"I'm nearing completion on a sculpture, current...",2000.0,support-casting-my-sculpture-in-bronze,False,US,USD,1243769880,1243770317,...,0,2009-05-01 13:22:21,2009-05-31 12:38:00,2009-05-31 12:45:17,29 days 23:15:39,29,00:07:17,7.283333,0,10


In [198]:
train_clean_df = pd.read_csv("train_cleaned.csv")
train_df['goal_usd'] = train_clean_df['goal_converted_usd']
train_df[['goal_usd','duration_days','layback_minutes','disable_dummy','country_label']]

Unnamed: 0,goal_usd,duration_days,layback_minutes,disable_dummy,country_label
0,23.41,8,0.300000,0,10
1,351.14,16,50.300000,0,10
2,35.11,9,4.300000,0,10
3,585.23,29,6.350000,0,10
4,2340.92,29,7.283333,0,10
...,...,...,...,...,...
108124,2118.89,30,0.050000,0,10
108125,15891.70,54,0.016667,0,10
108126,339.02,30,0.050000,0,10
108127,37080.63,30,0.066667,0,10


In [200]:
train_df['goal_per_day'] = train_df['goal_usd'] / train_df['duration_days']

In [201]:
train_df['goal_per_day']

0            2.926250
1           21.946250
2            3.901111
3           20.180345
4           80.721379
             ...     
108124      70.629667
108125     294.290741
108126      11.300667
108127    1236.021000
108128     588.581333
Name: goal_per_day, Length: 108129, dtype: float64

In [242]:
new_df = pd.read_csv('cleaned_train_raw.csv')
new_df.head()

Unnamed: 0.1,Unnamed: 0,project_id,name_len,desc_len,keywords_len,creation_to_launched,launched_to_deadline,goal_converted_usd,log_goal_currency_usd,desc_buzzword_count,...,kw_buzzword_count,year_launched_at,month_launched,project_yearly_count,goal_converted_usg_avg,goal_converted_usg_min,goal_converted_usg_max,disab_comm_label,month_launched_label,country_label
0,0,kkst1451568084,19,134,19,7.703459,13.502546,23.41,3.153163,0,...,0,2009,4,610,6688.779098,0.01,380399.28,0,3,10
1,1,kkst1474482071,76,137,74,9.640043,14.189422,351.14,5.861185,0,...,0,2009,4,610,6688.779098,0.01,380399.28,0,3,10
2,2,kkst183622197,13,385,12,6.665684,13.668358,35.11,3.558486,0,...,0,2009,5,610,6688.779098,0.01,380399.28,0,4,10
3,3,kkst597742710,30,131,29,7.984463,14.766786,585.23,6.372005,4,...,2,2009,4,610,6688.779098,0.01,380399.28,0,3,10
4,4,kkst1913131122,38,384,38,7.873598,14.766913,2340.92,7.758299,1,...,0,2009,5,610,6688.779098,0.01,380399.28,0,4,10


In [302]:
train_df['name_len'] = new_df['name_len']
train_df['desc_len'] = new_df['desc_len']
train_df['keywords_len'] = new_df['keywords_len']
train_df['project_yearly_count'] = new_df['project_yearly_count']
train_df['goal_converted_usg_avg'] = new_df['goal_converted_usg_avg']
train_df['goal_converted_usg_max'] = new_df['goal_converted_usg_max']

### Preparing train set and test set

In [296]:
X = train_df[['goal_per_day','disable_dummy','country_label','desc_len','project_yearly_count']]
y = train_df['final_status']

In [297]:
X = preprocessing.scale(X)

In [298]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

### Running the regression

In [299]:
logmodel = LogisticRegression(class_weight='balanced')
logmodel.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Evaluating the result

In [300]:
predictions = logmodel.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.74      0.51      0.61     22208
           1       0.37      0.62      0.46     10231

    accuracy                           0.55     32439
   macro avg       0.56      0.57      0.53     32439
weighted avg       0.63      0.55      0.56     32439



In [228]:
export_df = train_df[['goal_per_day','layback_minutes','disable_dummy','country_label','final_status']]
export_df.to_csv('slim_train.csv')

In [279]:
confusion_matrix(y_test, predictions)

array([[11342, 10745],
       [ 3996,  6356]])