#### Importownie bibliotek

In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score

from data import Data

#### Wczytanie csv przy pomocy klasy Data

In [2]:
data = Data()

#### Wyznaczenie mniejszego zbioru (5%) z całej bazy danych, przy pomocy metody dev_data w celu wstepnej analizy modeli

In [3]:
#dev_df_created - to nasze 5% zbioru

dev_df_created, rest_df_created, full_df_created = data.dev_data('pledged_in_usd', test_percent=0.05)

x_dev=dev_df_created.drop(['pledged_in_usd'], axis=1)
y_dev=dev_df_created['pledged_in_usd']

#### Reprezentacja x i y 

In [4]:
x_dev.head()

Unnamed: 0,main_cat_cat,country,duration,currency,goal_in_usd,percentage_of_money_collected,backers,state
134636,169,21,20,13,7500.0,1.2068,172,1
297403,145,0,10,0,765.81,1.624006,42,1
121577,0,21,30,13,400.0,1.075,8,1
174608,71,21,14,13,5000.0,0.028,5,0
57035,58,21,40,13,38000.0,0.004868,6,0


In [5]:
y_dev.head()

134636    9051.00
297403    1243.68
121577     430.00
174608     140.00
57035      185.00
Name: pledged_in_usd, dtype: float64

---
---

#### Funkcja, która przy pomocy GridSearch szuka najlepszych parametrów

In [6]:
def best_params(model, params, cv, x_train, y_train):
    best_model = GridSearchCV(model, params, cv=cv)
    best_model.fit(x_train, y_train)
    best_options = best_model.best_params_
    return best_options, best_model

#### Funkcja wyliczająca score z walidacji, r2 i mse dla najlepszych modeli

In [7]:
def prediction_reg(model, x_train, y_train, x_test, y_test, cv):
    y_pred = model.predict(x_test)
    
    score_val = np.mean(cross_val_score(model, x_train, y_train, cv=cv))
    score_r2 = r2_score(y_test, y_pred)
    score_mse = mean_squared_error(y_test, y_pred)
    return score_val, score_r2, score_mse

#### Funkcja potrzebna do KNeighborsRegressor (normalizuje x tak by były od 0 do 1)

In [8]:
def normalize_data(x_train, x_test):
    scaler = MinMaxScaler()
    scaler.fit(x_train)
    x_train_norm = scaler.transform(x_train)
    x_test_norm = scaler.transform(x_test)
    return x_train_norm, x_test_norm

#### Podział próbek na treningowe i testowe

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x_dev, y_dev, test_size=0.2)

In [10]:
x_train_norm, x_test_norm = normalize_data(x_train, x_test)

---
---

#### Stworzenie modeli regresji (do uzupełnienia params dla wszytskich modeli)

In [11]:
# DecisionTreeRegressor
dtr_model = DecisionTreeRegressor()
dtr_params = {}

In [12]:
# KNeighborsRegressor
knr_model = KNeighborsRegressor()
knr_params = {}

In [13]:
# RandomForestRegressor
rfr_model = RandomForestRegressor()
rfr_params = {}

In [14]:
# SVR
svm_model = svm.SVR()
svm_params = {}

In [15]:
# Xgboost
xgb_model = xgb.XGBRegressor()
xgb_params = {}

#### Upakowanie modeli do słownika

In [16]:
models={'DecisionTreeRegressor': (dtr_model,dtr_params),
        'KNeighborsRegressor': (knr_model, knr_params),
        'RandomForestRegressor' : (rfr_model, rfr_params),
        'SVR' : (svm_model, svm_params),
        'Xgboost' : (xgb_model, xgb_params)}

#### Szukanie najlepszych parametrów

In [17]:
results=[]
for key in models.keys():
    if key=='KNeighborsRegressor':
        best_options, best_model = best_params(models[key][0], models[key][1], 3, x_train_norm, y_train)
        score_val, score_r2, score_mse = prediction_reg(best_model, x_train_norm, y_train, x_test_norm, y_test, cv=3)
    else:
        best_options, best_model = best_params(models[key][0], models[key][1], 3, x_train, y_train)
        score_val, score_r2, score_mse = prediction_reg(best_model, x_train, y_train, x_test, y_test, cv=3)
    
    results.append([key, best_options, score_val, score_r2, score_mse])
    print(key+' - done!')

DecisionTreeRegressor - done!
KNeighborsRegressor - done!
RandomForestRegressor - done!
SVR - done!


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


Xgboost - done!


#### Stworznie data frame z ostatecznymi wynikami

In [18]:
results_df = pd.DataFrame(results, columns=["model", "best_options", "validation", "r2", "mse"])
results_df

Unnamed: 0,model,best_options,validation,r2,mse
0,DecisionTreeRegressor,{},0.450026,0.706948,4312816000.0
1,KNeighborsRegressor,{},0.380758,0.566775,6375727000.0
2,RandomForestRegressor,{},0.508197,0.866194,1969205000.0
3,SVR,{},-0.020216,-0.007243,14823480000.0
4,Xgboost,{},0.641833,0.741807,3799801000.0
