In [1]:
import pandas as pd
import numpy as np
# pd.set_option()


pd.set_option('display.max_columns',50)

train = pd.read_csv('./Data/Yes_Bank_Train.csv')

print(train.shape,'\n')
print(train.info(),'\n')
print(train.describe().T,'\n')
print(train.head(20),'\n')
print(train.tail(20),'\n')

(17773, 56) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17773 entries, 0 to 17772
Data columns (total 56 columns):
Serial Number                     17773 non-null int64
fund_symbol                       17773 non-null object
fund_name                         17773 non-null object
category                          17773 non-null object
fund_family                       17773 non-null object
investment                        16783 non-null object
size                              16783 non-null object
total_net_assets                  17773 non-null int64
currency                          17773 non-null object
net_annual_expenses_ratio         17761 non-null float64
morningstar_rating                17773 non-null int64
inception_date                    17773 non-null object
portfolio_cash                    17768 non-null float64
portfolio_stocks                  17768 non-null float64
portfolio_bonds                   17768 non-null float64
portfolio_others                  17

In [2]:
train = train[train['returns_ytd'].notnull()]

In [3]:
train = train[train['portfolio_bonds'].notnull()]


In [4]:
train['bonds_aaa'].isnull().sum()

0

In [5]:
train['bonds_aaa'].describe()

count    17654.000000
mean        12.512602
std         23.696518
min         -7.600000
25%          0.000000
50%          0.000000
75%         13.310000
max        104.170000
Name: bonds_aaa, dtype: float64

In [6]:
train['bonds_aa'].describe()

count    17654.000000
mean         4.548923
std         12.366100
min        -11.460000
25%          0.000000
50%          0.000000
75%          2.585000
max        177.170000
Name: bonds_aa, dtype: float64

In [7]:
train['net_annual_expenses_ratio'].describe()

count    17642.000000
mean         1.028833
std          0.623832
min          0.000000
25%          0.630000
50%          0.970000
75%          1.370000
max         19.100000
Name: net_annual_expenses_ratio, dtype: float64

In [8]:
train['bonds_aaa'].isnull().sum()

0

### 2 Data exploration and missing value treatment

In [9]:
import seaborn as sns
import numpy as np

In [10]:
train['investment'] = train['investment'].replace(np.nan, 'MISSING')

In [11]:
train['size'] = train['size'].replace(np.nan, 'MISSING_size')

In [12]:
from sklearn.preprocessing import Imputer

#### 2.1 Handelling the categorical variables -- Sum and investment

In [13]:
investment_encod = pd.get_dummies(train['investment'])
train = train.join(investment_encod)
# print(train.describe())

In [14]:
investment_encod = pd.get_dummies(train['size'])
train = train.join(investment_encod)
# print(train.describe())

In [15]:
train.isnull().sum()

Serial Number                        0
fund_symbol                          0
fund_name                            0
category                             0
fund_family                          0
investment                           0
size                                 0
total_net_assets                     0
currency                             0
net_annual_expenses_ratio           12
morningstar_rating                   0
inception_date                       0
portfolio_cash                       0
portfolio_stocks                     0
portfolio_bonds                      0
portfolio_others                     0
portfolio_preferred                  0
portfolio_convertable                0
sectors_basic_materials              0
sectors_consumer_cyclical            0
sectors_financial_services           0
sectors_real_estate                  0
sectors_consumer_defensive           0
sectors_healthcare                   0
sectors_utilities                    0
sectors_communication_ser

#### 2.2 Performing Imputations - for selected features


In [16]:
# choose variable mean_annula_3y or standard_deviation_3y with mean/median imputation
# choose sharpe_ratio_3y with mena or median imputation

mean_imputer = Imputer()
median_imputer = Imputer(strategy='median')
train['standard_deviation_3y']  = median_imputer.fit_transform(train[['standard_deviation_3y']]).ravel()
train['sharpe_ratio_3y']  = median_imputer.fit_transform(train[['sharpe_ratio_3y']]).ravel()

# use returns_15 and returns_17 with mean imputations


train['returns_2015']  = mean_imputer.fit_transform(train[['returns_2015']]).ravel()
train['returns_2017']  = mean_imputer.fit_transform(train[['returns_2017']]).ravel()

#### Selecting the feature to build the model

In [17]:
train_select = train[['total_net_assets','net_annual_expenses_ratio', 'morningstar_rating','portfolio_cash', 'portfolio_stocks', 'portfolio_bonds',
       'portfolio_others', 'portfolio_preferred', 'portfolio_convertable',
        'bonds_aaa',
       'sectors_basic_materials', 'sectors_consumer_cyclical',
       'sectors_financial_services', 'sectors_real_estate',
       'sectors_consumer_defensive', 'sectors_healthcare', 'sectors_utilities',
       'sectors_communication_services', 'sectors_energy',
       'sectors_industrials', 'sectors_technology', 'price_earning','returns_ytd', 'returns_2017',
       'returns_2015','morningstar_risk_rating',
        'standard_deviation_3y', 'sharpe_ratio_3y',
       'MISSING_size', 'Medium', 'Small']]
# 'Blend', 'Growth', 'MISSING', 'Value', 'Large',

#### Comparing different Regression algorithms

In [18]:
from sklearn.model_selection import train_test_split

target = train_select['bonds_aaa']
predictors = train_select.drop(['bonds_aaa'],axis=1)

In [19]:
predictors = median_imputer.fit_transform(predictors)

In [20]:
target = median_imputer.fit_transform(train[['bonds_aaa']]).ravel()

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(predictors, target, train_size=0.8,test_size=0.2, random_state=7)


In [23]:
from sklearn import linear_model, svm, tree, ensemble

models = [
    linear_model.LinearRegression(),
    linear_model.Ridge(),
    linear_model.Lasso(),
    linear_model.ElasticNet(),
    linear_model.BayesianRidge(),
#     linear_model.RANSACRegressor(),
    svm.LinearSVR(),
    svm.SVR(),
    tree.DecisionTreeRegressor(),
    tree.ExtraTreeRegressor(),
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor()

]

In [24]:
for model in models:
    model.fit(predictors,target)
    print('-'*30)
    print(model.__class__.__name__)
    
    train_pred = model.predict(predictors)
    train_rmse = np.sqrt(mean_squared_error(target, train_pred))
    print("Root mean square error train {}".format(train_rmse))    

    pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(Y_test, pred))
    print("Root mean square error test {}".format(rmse))

------------------------------
LinearRegression
Root mean square error train 19.152215563350005
Root mean square error test 18.520094824362893
------------------------------
Ridge
Root mean square error train 19.152218941545783
Root mean square error test 18.519886909277528


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.960428e-22
  overwrite_a=True).T


------------------------------
Lasso
Root mean square error train 20.468657699488862
Root mean square error test 19.626710657679723
------------------------------
ElasticNet
Root mean square error train 20.520837025374224
Root mean square error test 19.666556808849574
------------------------------
BayesianRidge
Root mean square error train 19.15244185027426
Root mean square error test 18.518579682420587
------------------------------
LinearSVR
Root mean square error train 761247.4590919425
Root mean square error test 629254.368469513
------------------------------
SVR
Root mean square error train 24.015482832611983
Root mean square error test 22.72567642408247
------------------------------
DecisionTreeRegressor
Root mean square error train 1.8000774049971332e-15
Root mean square error test 1.79417246503836e-15
------------------------------
ExtraTreeRegressor
Root mean square error train 1.661901196390451e-15
Root mean square error test 1.6806659122121832e-15
------------------------

### Getting the result for the test data set

In [25]:
test = pd.read_csv('./Data/Yes_Bank_Test_Data.csv')

In [26]:
test.isnull().sum()

Serial Number                        0
fund_symbol                          0
fund_name                            0
category                             0
fund_family                          0
investment                         560
size                               560
total_net_assets                     0
currency                             0
net_annual_expenses                  4
morningstar_rating                   0
inception_date                       0
portfolio_cash                       3
portfolio_stocks                     3
portfolio_bonds                      3
portfolio_others                     3
portfolio_preferred                  3
portfolio_convertable                3
sectors_basic_materials              3
sectors_consumer_cyclical            3
sectors_financial_services           3
sectors_real_estate                  3
sectors_consumer_defensive           3
sectors_healthcare                   3
sectors_utilities                    3
sectors_communication_ser

In [27]:
# test = test[test['returns_ytd'].notnull()]

# test = test[test['portfolio_bonds'].notnull()]
# test = test[test['net_annual_expenses'].notnull()]

test['returns_ytd'] = median_imputer.fit_transform(test[['returns_ytd']]).ravel()

test['portfolio_bonds'] = median_imputer.fit_transform(test[['portfolio_bonds']]).ravel()
test['net_annual_expenses'] = median_imputer.fit_transform(test[['net_annual_expenses']]).ravel()

In [28]:
test['investment'] = test['investment'].replace(np.nan, 'MISSING')

In [29]:
test['size'] = test['size'].replace(np.nan, 'MISSING_size')

In [30]:
investment_encod = pd.get_dummies(test['investment'])
test = test.join(investment_encod)

size_encod = pd.get_dummies(test['size'])
test = test.join(size_encod)

In [31]:
# choose variable mean_annula_3y or standard_deviation_3y with mean/median imputation
# choose sharpe_ratio_3y with mena or median imputation

mean_imputer = Imputer()
median_imputer = Imputer(strategy='median')
test['standard_deviation_3y']  = median_imputer.fit_transform(test[['standard_deviation_3y']]).ravel()
test['sharpe_ratio_3y']  = median_imputer.fit_transform(test[['sharpe_ratio_3y']]).ravel()

# use returns_15 and returns_17 with mean imputations


test['returns_2015']  = mean_imputer.fit_transform(test[['returns_2015']]).ravel()
test['returns_2017']  = mean_imputer.fit_transform(test[['returns_2017']]).ravel()

In [32]:
test['net_annual_expenses_ratio'] = test['net_annual_expenses']

In [33]:
test = test[['total_net_assets','net_annual_expenses_ratio', 'morningstar_rating','portfolio_cash', 'portfolio_stocks', 'portfolio_bonds',
       'portfolio_others', 'portfolio_preferred', 'portfolio_convertable',
       'sectors_basic_materials', 'sectors_consumer_cyclical',
       'sectors_financial_services', 'sectors_real_estate',
       'sectors_consumer_defensive', 'sectors_healthcare', 'sectors_utilities',
       'sectors_communication_services', 'sectors_energy',
       'sectors_industrials', 'sectors_technology', 'price_earning','returns_ytd', 'returns_2017',
       'returns_2015','morningstar_risk_rating',
        'standard_deviation_3y', 'sharpe_ratio_3y',
       'MISSING_size', 'Medium', 'Small']]

In [34]:
test  = mean_imputer.fit_transform(test)

In [35]:
test_pred = models[-3].predict(test)

In [36]:
test_sub = pd.read_csv('./Data/Yes_Bank_Test_Data.csv')

In [37]:
test_sub.shape

(7621, 48)

In [38]:
result = pd.concat([test_sub['Serial Number'],pd.Series(test_pred, name='bonds_aaa')], axis =1,)


In [39]:
pd.DataFrame.to_csv(resul``t,path_or_buf='./output/results_yes_3_4.csv',index=False)

In [4]:
import numpy as np

In [15]:
np.random.randint(0,10**5,10**6).tolist

array([89983, 81857, 36173, ..., 60411, 12197, 73829])

In [None]:
for _ in range(10**6):
    a.append(np.rand.random()

In [2]:
a = [2,12,1,3,4,1,3,72,4,6,2,45,73245,23,4]

In [3]:
%%time
sorted = False
while not sorted:
    for i in range(len(a)-1):
        if a[i] > a[i+1]:
            a[i], a[i+1] = a[i+1], a[i]
    else:
        sorted = True

Wall time: 0 ns
