## I. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## II. Function Convert

In [2]:
# Per-column label -> integer code tables used by convert(). They are built once
# here instead of on every call, because convert() is applied element-wise to
# whole DataFrame columns. All codes are unchanged from the original single table.
_CONVERT_TABLES = {
    'state': {
        'failed': 0, 'successful': 1,
    },
    'category': {
        '3D Printing': 1, 'Academic': 2, 'Accessories': 3, 'Action': 4, 'Animals': 5,
        'Animation': 6, 'Anthologies': 7, 'Apparel': 8, 'Apps': 9,
        'Architecture': 10, 'Art': 11, 'Art Books': 12, 'Audio': 13, 'Bacon': 14,
        'Blues': 15, 'Calendars': 16, 'Camera Equipment': 17, 'Candles': 18, 'Ceramics': 19,
        "Children's Books": 20, 'Childrenswear': 21, 'Chiptune': 22, 'Civic Design': 23, 'Classical Music': 24,
        'Comedy': 25, 'Comic Books': 26, 'Comics': 27, 'Community Gardens': 28, 'Conceptual Art': 29,
        'Cookbooks': 30, 'Country & Folk': 31, 'Couture': 32, 'Crafts': 33, 'Crochet': 34,
        'DIY': 35, 'DIY Electronics': 36, 'Dance': 37, 'Design': 38, 'Digital Art': 39,
        'Documentary': 40, 'Drama': 41, 'Drinks': 42, 'Electronic Music': 43, 'Embroidery': 44,
        'Events': 45, 'Experimental': 46, 'Fabrication Tools': 47, 'Faith': 48, 'Family': 49,
        'Fantasy': 50, "Farmer's Markets": 51, 'Farms': 52, 'Fashion': 53, 'Festivals': 54,
        'Fiction': 55, 'Film & Video': 56, 'Fine Art': 57, 'Flight': 58, 'Food': 59,
        'Food Trucks': 60, 'Footwear': 61, 'Gadgets': 62, 'Games': 63, 'Gaming Hardware': 64,
        'Glass': 65, 'Graphic Design': 66, 'Graphic Novels': 67, 'Hardware': 68, 'Hip-Hop': 69,
        'Horror': 70, 'Illustration': 71, 'Immersive': 72, 'Indie Rock': 73, 'Installations': 74,
        'Interactive Design': 75, 'Jazz': 76, 'Jewelry': 77, 'Journalism': 78, 'Kids': 79,
        'Knitting': 80, 'Latin': 81, 'Letterpress': 82, 'Literary Journals': 83, 'Literary Spaces': 84,
        'Live Games': 85, 'Makerspaces': 86, 'Metal': 87, 'Mixed Media': 88, 'Mobile Games': 89,
        'Movie Theaters': 90, 'Music': 91, 'Music Videos': 92, 'Musical': 93, 'Narrative Film': 94,
        'Nature': 95, 'Nonfiction': 96, 'Painting': 97, 'People': 98, 'Performance Art': 99,
        'Performances': 100, 'Periodicals': 101, 'Pet Fashion': 102, 'Photo': 103, 'Photobooks': 104,
        'Photography': 105, 'Places': 106, 'Playing Cards': 107, 'Plays': 108, 'Poetry': 109,
        'Pop': 110, 'Pottery': 111, 'Print': 112, 'Printing': 113, 'Product Design': 114,
        'Public Art': 115, 'Publishing': 116, 'Punk': 117, 'Puzzles': 118, 'Quilts': 119,
        'R&B': 120, 'Radio & Podcasts': 121, 'Ready-to-wear': 122, 'Residencies': 123, 'Restaurants': 124,
        'Robots': 125, 'Rock': 126, 'Romance': 127, 'Science Fiction': 128, 'Sculpture': 129,
        'Shorts': 130, 'Small Batch': 131, 'Software': 132, 'Sound': 133, 'Space Exploration': 134,
        'Spaces': 135, 'Stationery': 136, 'Tabletop Games': 137, 'Taxidermy': 138, 'Technology': 139,
        'Television': 140, 'Textiles': 141, 'Theater': 142, 'Thrillers': 143, 'Translations': 144,
        'Typography': 145, 'Vegan': 146, 'Video': 147, 'Video Art': 148, 'Video Games': 149,
        'Wearables': 150, 'Weaving': 151, 'Web': 152, 'Webcomics': 153, 'Webseries': 154,
        'Woodworking': 155, 'Workshops': 156, 'World Music': 157, 'Young Adult': 158, 'Zines': 159,
    },
    'main_category': {
        'Art': 1, 'Comics': 2, 'Crafts': 3, 'Dance': 4, 'Design': 5,
        'Fashion': 6, 'Film & Video': 7, 'Food': 8, 'Games': 9, 'Journalism': 10,
        'Music': 11, 'Photography': 12, 'Publishing': 13, 'Technology': 14, 'Theater': 15,
    },
    'country': {
        'AT': 1, 'AU': 2, 'BE': 3, 'CA': 4, 'CH': 5, 'DE': 6, 'DK': 7, 'ES': 8, 'FR': 9, 'GB': 10, 'HK': 11, 'IE': 12, 'IT': 13,
        'JP': 14, 'LU': 15, 'MX': 16, 'NL': 17, 'NO': 18, 'NZ': 19, 'SE': 20, 'SG': 21, 'US': 22,
    },
    'currency': {
        'AUD': 1, 'CAD': 2, 'CHF': 3, 'DKK': 4, 'EUR': 5, 'GBP': 6, 'HKD': 7, 'JPY': 8, 'MXN': 9,
        'NOK': 10, 'NZD': 11, 'SEK': 12, 'SGD': 13, 'USD': 14,
    },
}

# Flattened table that reproduces the original single-dictionary lookup. The
# tables are merged in the original literal's order, so on labels shared by
# 'category' and 'main_category' (e.g. 'Art', 'Music') the main-category code wins.
_CONVERT_LEGACY = {}
for _field_name in ('state', 'category', 'main_category', 'country', 'currency'):
    _CONVERT_LEGACY.update(_CONVERT_TABLES[_field_name])


def convert(x, field=None):
    """Map a text label to its integer code.

    Parameters
    ----------
    x : hashable
        Label to encode (state, category, main category, country or currency).
    field : str, optional
        Which table to use: 'state', 'category', 'main_category', 'country'
        or 'currency'. When omitted, the legacy merged table is used. In that
        table the 15 labels that are both a category and a main category
        resolve to the main-category code, so category 'Art' becomes 1 rather
        than 11 and collides with '3D Printing'. Pass ``field`` to avoid this,
        e.g. ``series.apply(convert, field='category')``.

    Returns
    -------
    int or str
        The integer code, or the string "nothing" when the label has no code
        in the selected table.

    Raises
    ------
    ValueError
        If ``field`` is given but is not one of the known table names.
    """
    if field is None:
        table = _CONVERT_LEGACY
    elif field in _CONVERT_TABLES:
        table = _CONVERT_TABLES[field]
    else:
        raise ValueError(
            "unknown field {!r}; expected one of {}".format(field, sorted(_CONVERT_TABLES)))
    return table.get(x, "nothing")

## III. Get Data

In [3]:
# Load the kickstarter.csv dataset from the remote repository as UTF-8 text.
data = pd.read_csv(
    'http://git.aureole-it.vn/huynd/mydata/raw/master/kickstarter.csv',
    encoding='utf-8',
)

In [4]:
# Parse both timestamp columns, then store the day component of the
# deadline-minus-launch difference in a new 'times' column.
for date_column in ('deadline', 'launched'):
    data[date_column] = pd.to_datetime(data[date_column])
campaign_length = data['deadline'] - data['launched']
data['times'] = campaign_length.dt.days
data['times'] = data['times'].astype('int64')

## IV. Clean Data

In [5]:
# Drop every row that has a missing value. This runs before the column drop,
# so a gap in one of the columns removed below also discards the row.
data.dropna(inplace=True)

# Columns removed before modelling.
unused_columns = ['ID', 'name', 'launched', 'deadline', 'backers', 'pledged',
                  'usd pledged', 'usd_pledged_real', 'usd_goal_real']
data.drop(columns=unused_columns, inplace=True)

# Replace text labels with the integer codes from convert(); a label without a
# code becomes the string 'nothing'.
# NOTE(review): convert() is called without a column hint, so labels that are
# both a category and a main category (e.g. 'Art', 'Music') get the
# main-category code in the 'category' column as well - confirm this is intended.
for label_column in ('main_category', 'category', 'country', 'state', 'currency'):
    data[label_column] = data[label_column].apply(convert)

## V. Convert Data

In [6]:
# Keep only the rows whose state received a code from convert(), i.e. drop the
# ones encoded as 'nothing'. Take an explicit copy so the assignment below
# writes to an independent frame instead of a slice of `data`; without it
# pandas emits SettingWithCopyWarning.
model_data = data[data['state'] != 'nothing'].copy()

# With the 'nothing' rows removed, the target can be stored as integers.
model_data['state'] = model_data['state'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Preview the first rows of the filtered, encoded frame.
model_data.head()

Unnamed: 0,category,main_category,currency,goal,state,country,times
0,109,13,6,1000.0,0,10,58
1,94,7,14,30000.0,0,22,59
2,94,7,14,45000.0,0,22,44
3,11,11,14,5000.0,0,22,29
5,124,8,14,50000.0,1,22,34


## VI. Build Model

In [8]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

- **Split data**

In [9]:
# Split data: every column except 'state' is a predictor; 'state' is the target.
X = model_data.drop('state', axis=1)
y = model_data['state']

# Hold out 20% of the rows as the test set, then take 15% of the remaining rows
# as the validation set. Both splits use the same fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42)

### 1/ Random Forest

- **Import Library**

In [14]:
from sklearn.ensemble import RandomForestClassifier

- **Build and Test RDF Model**

In [15]:
# Build model: standardise the features, then fit a 200-tree random forest
# using all available cores and a fixed seed.
rdf = Pipeline([
    ("Scaler", StandardScaler()),
    ("RandomForest",
     RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)),
])
rdf.fit(X_train, y_train)
pre = rdf.predict(X_val)
# Test: accuracy on the validation split. Arguments follow the documented
# (y_true, y_pred) order; the score itself is unchanged.
accuracy_score(y_val, pre)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


0.6532833869670153

- **Save RDF Model**

In [None]:
# Persist the fitted random-forest pipeline. The context manager closes the
# file handle even if pickling fails (the original never closed it).
with open('RDF.pkl', 'wb') as model_file:
    pickle.dump(rdf, model_file, protocol=4)

### 2/ Decision Tree

- **Import Library**

In [12]:
from sklearn.tree import DecisionTreeClassifier

- **Build and Test DTF Model**

In [13]:
# Build model: standardise the features, then fit a decision tree with a fixed seed.
dtf = Pipeline([
    ("Scaler", StandardScaler()),
    ("DecisionTree", DecisionTreeClassifier(random_state=42)),
])
dtf.fit(X_train, y_train)
pre = dtf.predict(X_val)
# Test: accuracy on the validation split. Arguments follow the documented
# (y_true, y_pred) order; the score itself is unchanged.
accuracy_score(y_val, pre)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


0.6353831456154465

- **Save DTF Model**

In [None]:
# Persist the fitted decision-tree pipeline. The context manager closes the
# file handle even if pickling fails (the original never closed it).
with open('DTF.pkl', 'wb') as model_file:
    pickle.dump(dtf, model_file, protocol=4)

### 3/ Logistic Regression

- **Import Library**

In [10]:
from sklearn.linear_model import LogisticRegression

- **Build and Test Logistic Regression Model**

In [11]:
# Build model: logistic regression with default settings, fitted on the raw
# (unscaled) training features.
clf = LogisticRegression()
clf.fit(X=X_train, y=y_train)
pre = clf.predict(X=X_test)
# Test: accuracy on the test split.
# NOTE(review): the random-forest and decision-tree cells score on the
# validation split instead - confirm which split the comparison should use.
accuracy_score(y_true=y_test, y_pred=pre)



0.5991582821715716

- **Save Logistic Regression Model**

In [None]:
# Persist the fitted logistic-regression model. The context manager closes the
# file handle even if pickling fails (the original never closed it).
with open('CLF.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file, protocol=4)

### 4/ Gradient Boosting

- **Import Library**

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

- **Build and Test GBF Model**

In [17]:
# Build model: standardise the features, then fit a 500-stage gradient-boosting
# classifier with a fixed seed.
gbf = Pipeline([
    ("Scaler", StandardScaler()),
    ("GradientBoosting",
     GradientBoostingClassifier(n_estimators=500, random_state=42)),
])
gbf.fit(X_train, y_train)
y_pre = gbf.predict(X_test)
# Test: accuracy on the test split. Arguments follow the documented
# (y_true, y_pred) order; the score itself is unchanged.
# NOTE(review): the random-forest and decision-tree cells score on the
# validation split instead - confirm which split the comparison should use.
accuracy_score(y_test, y_pre)

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


0.682349569336129

- **Save GBF Model**

In [11]:
# Persist the fitted gradient-boosting pipeline. The context manager closes the
# file handle even if pickling fails (the original never closed it).
with open('GBF.pkl', 'wb') as model_file:
    pickle.dump(gbf, model_file, protocol=4)