In [1]:
import pandas as pd
import numpy as np
from ggplot import *
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt

%matplotlib inline

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,


In [2]:
train_df = pd.read_csv("Yes_Bank_Train.csv",)
test_df = pd.read_csv("Yes_Bank_Test_Data.csv")
sample = pd.read_csv("Yes_Bank_sample_submission.csv")

In [3]:
print("Train Set Shape: {}".format(train_df.shape))
print("Test Set Shape: {}".format(test_df.shape))

Train Set Shape: (17773, 56)
Test Set Shape: (7621, 48)


# Removing Attributes with completly different or uniform distribution

In [4]:
train_df = train_df.drop(labels="fund_symbol", axis=1)
train_df = train_df.drop(labels="fund_name", axis=1)
train_df = train_df.drop(labels="currency", axis=1)

test_df = test_df.drop(labels="fund_symbol", axis=1)
test_df = test_df.drop(labels="fund_name", axis=1)
test_df = test_df.drop(labels="currency", axis=1)

# Data Cleaning [Categeorical Values]

In [5]:
nominal = train_df.describe().columns
all_attr = train_df.columns
categeory = [x for x in all_attr if x not in nominal]

In [6]:
categeory

['category', 'fund_family', 'investment', 'size', 'inception_date']

In [7]:
for x in categeory:
    print("{}: {}".format(x, len(set(train_df[x]))))

category: 108
fund_family: 587
investment: 4
size: 4
inception_date: 3418


In [8]:
print("Missing Values in Categorical Attributes")
for col in categeory:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Categorical Attributes
category: 0
fund_family: 0
investment: 990
size: 990
inception_date: 0


In [9]:
# This has removed those records where categorical values were missed.

train_df = train_df[~train_df["investment"].isna()]

In [10]:
print("Missing Values in Categorical Attributes in Train DataSet")
for col in categeory:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Categorical Attributes in Train DataSet
category: 0
fund_family: 0
investment: 0
size: 0
inception_date: 0


In [11]:
for x in categeory:
    print("{}: {}".format(x, len(set(test_df[x]))))

category: 110
fund_family: 312
investment: 4
size: 4
inception_date: 2110


In [12]:
print("Missing Values in Categorical Attributes in Test DataSet")
for col in categeory:
    print("{}: {}".format(col, test_df[col].isna().sum()))

Missing Values in Categorical Attributes in Test DataSet
category: 0
fund_family: 0
investment: 560
size: 560
inception_date: 0


In [13]:
test_df["size"] = [np.random.choice(list(set(test_df["size"]))[1:])  if test_df["size"][x] is np.nan  else test_df["size"][x] for x in range(test_df.shape[0])]

In [14]:
test_df["investment"] = [np.random.choice(list(set(test_df["investment"]))[1:])  if test_df["investment"][x] is np.nan  else test_df["investment"][x] for x in range(test_df.shape[0])]

# Data Cleaning [Nominal Values]

In [15]:
print("Missing Values in Nominal Attributes")
for col in nominal:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Nominal Attributes
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 12
morningstar_rating: 0
portfolio_cash: 4
portfolio_stocks: 4
portfolio_bonds: 4
portfolio_others: 4
portfolio_preferred: 4
portfolio_convertable: 4
sectors_basic_materials: 4
sectors_consumer_cyclical: 4
sectors_financial_services: 4
sectors_real_estate: 4
sectors_consumer_defensive: 4
sectors_healthcare: 4
sectors_utilities: 4
sectors_communication_services: 4
sectors_energy: 4
sectors_industrials: 4
sectors_technology: 4
price_earning: 4
bonds_us_government: 4
bonds_aaa: 4
bonds_aa: 4
bonds_a: 4
bonds_bbb: 4
bonds_bb: 4
bonds_b: 4
bonds_below_b: 4
bonds_others: 4
morningstar_return_rating: 0
returns_ytd: 93
returns_2017: 1715
returns_2016: 2664
returns_2015: 3769
returns_2014: 4808
returns_2013: 5870
returns_2012: 6881
returns_2011: 7668
returns_2010: 8440
morningstar_risk_rating: 0
alpha_3y: 1631
beta_3y: 1631
mean_annual_return_3y: 1631
standard_deviation_3y: 1631
sharpe_ratio_3y:

In [16]:
# This has removed those records where most of the values were null for a record.

train_df = train_df[~train_df["portfolio_cash"].isna()]
train_df = train_df[~train_df["net_annual_expenses_ratio"].isna()]

In [17]:
print("Missing Values in Nominal Attributes in Train Set")
for col in nominal:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Nominal Attributes in Train Set
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 0
morningstar_rating: 0
portfolio_cash: 0
portfolio_stocks: 0
portfolio_bonds: 0
portfolio_others: 0
portfolio_preferred: 0
portfolio_convertable: 0
sectors_basic_materials: 0
sectors_consumer_cyclical: 0
sectors_financial_services: 0
sectors_real_estate: 0
sectors_consumer_defensive: 0
sectors_healthcare: 0
sectors_utilities: 0
sectors_communication_services: 0
sectors_energy: 0
sectors_industrials: 0
sectors_technology: 0
price_earning: 0
bonds_us_government: 0
bonds_aaa: 0
bonds_aa: 0
bonds_a: 0
bonds_bbb: 0
bonds_bb: 0
bonds_b: 0
bonds_below_b: 0
bonds_others: 0
morningstar_return_rating: 0
returns_ytd: 92
returns_2017: 1702
returns_2016: 2651
returns_2015: 3756
returns_2014: 4795
returns_2013: 5857
returns_2012: 6868
returns_2011: 7655
returns_2010: 8427
morningstar_risk_rating: 0
alpha_3y: 1618
beta_3y: 1618
mean_annual_return_3y: 1618
standard_deviation_3y: 1618
shar

In [18]:
for col in nominal:
    if train_df[col].isna().sum() != 0:
        train_df[col] = train_df[col].interpolate(method="piecewise_polynomial")

In [19]:
print("Missing Values in Nominal Attributes")
for col in nominal:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Nominal Attributes
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 0
morningstar_rating: 0
portfolio_cash: 0
portfolio_stocks: 0
portfolio_bonds: 0
portfolio_others: 0
portfolio_preferred: 0
portfolio_convertable: 0
sectors_basic_materials: 0
sectors_consumer_cyclical: 0
sectors_financial_services: 0
sectors_real_estate: 0
sectors_consumer_defensive: 0
sectors_healthcare: 0
sectors_utilities: 0
sectors_communication_services: 0
sectors_energy: 0
sectors_industrials: 0
sectors_technology: 0
price_earning: 0
bonds_us_government: 0
bonds_aaa: 0
bonds_aa: 0
bonds_a: 0
bonds_bbb: 0
bonds_bb: 0
bonds_b: 0
bonds_below_b: 0
bonds_others: 0
morningstar_return_rating: 0
returns_ytd: 0
returns_2017: 0
returns_2016: 0
returns_2015: 6
returns_2014: 10
returns_2013: 10
returns_2012: 10
returns_2011: 10
returns_2010: 10
morningstar_risk_rating: 0
alpha_3y: 0
beta_3y: 0
mean_annual_return_3y: 0
standard_deviation_3y: 0
sharpe_ratio_3y: 0
treynor_ratio_3y: 0


In [20]:
for col in nominal:
    if train_df[col].isna().sum() != 0:
        train_df[col] = train_df[col].interpolate(method = "linear")

In [21]:
print("Missing Values in Nominal Attributes in Test Set")

nominal_test = [x for x in test_df.columns if x not in categeory]
for col in nominal_test:
    print("{}: {}".format(col, test_df[col].isna().sum()))

Missing Values in Nominal Attributes in Test Set
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 4
morningstar_rating: 0
portfolio_cash: 3
portfolio_stocks: 3
portfolio_bonds: 3
portfolio_others: 3
portfolio_preferred: 3
portfolio_convertable: 3
sectors_basic_materials: 3
sectors_consumer_cyclical: 3
sectors_financial_services: 3
sectors_real_estate: 3
sectors_consumer_defensive: 3
sectors_healthcare: 3
sectors_utilities: 3
sectors_communication_services: 3
sectors_energy: 3
sectors_industrials: 3
sectors_technology: 3
price_earning: 3
bonds_us_government: 3
morningstar_return_rating: 0
returns_ytd: 22
returns_2017: 782
returns_2016: 1369
returns_2015: 1963
returns_2014: 2401
returns_2013: 2696
returns_2012: 3062
returns_2011: 3283
returns_2010: 3568
morningstar_risk_rating: 0
alpha_3y: 787
beta_3y: 787
mean_annual_return_3y: 787
standard_deviation_3y: 787
sharpe_ratio_3y: 787
treynor_ratio_3y: 787


In [22]:
for col in nominal_test:
    if test_df[col].isna().sum() != 0:
        test_df[col] = test_df[col].interpolate(method="piecewise_polynomial")

In [23]:
print("Missing Values in Nominal Attributes in Test Set")

nominal_test = [x for x in test_df.columns if x not in categeory]
for col in nominal_test:
    print("{}: {}".format(col, test_df[col].isna().sum()))

Missing Values in Nominal Attributes in Test Set
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 0
morningstar_rating: 0
portfolio_cash: 0
portfolio_stocks: 0
portfolio_bonds: 0
portfolio_others: 0
portfolio_preferred: 0
portfolio_convertable: 0
sectors_basic_materials: 0
sectors_consumer_cyclical: 0
sectors_financial_services: 0
sectors_real_estate: 0
sectors_consumer_defensive: 0
sectors_healthcare: 0
sectors_utilities: 0
sectors_communication_services: 0
sectors_energy: 0
sectors_industrials: 0
sectors_technology: 0
price_earning: 0
bonds_us_government: 0
morningstar_return_rating: 0
returns_ytd: 0
returns_2017: 0
returns_2016: 0
returns_2015: 7
returns_2014: 8
returns_2013: 8
returns_2012: 8
returns_2011: 9
returns_2010: 9
morningstar_risk_rating: 0
alpha_3y: 4
beta_3y: 4
mean_annual_return_3y: 4
standard_deviation_3y: 4
sharpe_ratio_3y: 4
treynor_ratio_3y: 4


In [24]:
for col in nominal_test:
    if test_df[col].isna().sum() != 0:
        test_df[col] = test_df[col].interpolate(method = "linear")

In [25]:
print("Missing Values in Nominal Attributes in Test Set")

for col in nominal_test:
    print("{}: {}".format(col, test_df[col].isna().sum()))

Missing Values in Nominal Attributes in Test Set
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 0
morningstar_rating: 0
portfolio_cash: 0
portfolio_stocks: 0
portfolio_bonds: 0
portfolio_others: 0
portfolio_preferred: 0
portfolio_convertable: 0
sectors_basic_materials: 0
sectors_consumer_cyclical: 0
sectors_financial_services: 0
sectors_real_estate: 0
sectors_consumer_defensive: 0
sectors_healthcare: 0
sectors_utilities: 0
sectors_communication_services: 0
sectors_energy: 0
sectors_industrials: 0
sectors_technology: 0
price_earning: 0
bonds_us_government: 0
morningstar_return_rating: 0
returns_ytd: 0
returns_2017: 0
returns_2016: 0
returns_2015: 3
returns_2014: 3
returns_2013: 3
returns_2012: 3
returns_2011: 3
returns_2010: 3
morningstar_risk_rating: 0
alpha_3y: 0
beta_3y: 0
mean_annual_return_3y: 0
standard_deviation_3y: 0
sharpe_ratio_3y: 0
treynor_ratio_3y: 0


In [26]:
end_attr = [x for x in test_df.columns if test_df[x].isna().sum()]

In [27]:
#Those values which can't be interpolated are filled with mean value of the column

for x in end_attr:
    test_df.set_value(test_df[x].isna(), x, test_df[x].mean())

  after removing the cwd from sys.path.


# Assigining Numerical Labels to Categeorical Values

In [28]:
class LabelEncoder:
    def __init__(self):
        self.count = 0
        self.labels = {}
        
    def fit(self, y):
        for x in list(set(y)):
            self.labels[x] = self.count
            self.count +=1
    
    def transform(self, y):
        mask = [x in list(self.labels.keys()) for x in y]
        
        for i in range(len(y)):
            if mask[i] is True:
                y[i] = self.labels[y[i]]
            else:
                self.labels[y[i]] = self.count
                y[i] = self.labels[y[i]]
                mask[i] = True
                self.count += 1
        return y

In [29]:
train_df_cp = train_df.copy(deep=True)

In [30]:
cache_mapping = {}
for x in categeory:
    mapping = LabelEncoder()
    cache_mapping[x] = mapping
    mapping.fit(train_df[x].values)
    train_df[x] = mapping.transform(train_df[x].values)

In [31]:
for x in categeory:
    mapping = cache_mapping[x]
    test_df[x] = mapping.transform(test_df[x].values)

   # Feathure Selection Stage 1

In [32]:
test_cols = test_df.columns

In [33]:
xtra_attr = [col for col in all_attr if col not in test_cols]
xtra_attr

['bonds_aaa',
 'bonds_aa',
 'bonds_a',
 'bonds_bbb',
 'bonds_bb',
 'bonds_b',
 'bonds_below_b',
 'bonds_others']

In [34]:
Y = train_df["bonds_aaa"].values

In [35]:
for col in xtra_attr:
    train_df = train_df.drop(labels=col, axis=1)

In [36]:
X = train_df.values

In [37]:
X_FIN_TEST = test_df.values

In [38]:
X.shape

(16767, 45)

In [39]:
X_FIN_TEST.shape

(7621, 45)

# Normalization

In [40]:
from sklearn.preprocessing import StandardScaler

In [41]:
std_x = StandardScaler()
X_std = std_x.fit_transform(X)



In [42]:
std_y = StandardScaler()
Y_std = std_y.fit_transform(Y.reshape(1, -1))

In [43]:
X_FIN_std = std_x.transform(X_FIN_TEST)



# Feathure Selection Stage 2

In [44]:
from sklearn.decomposition import PCA

In [45]:
pca_x = PCA()

In [46]:
pca_x.fit(X_std)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [47]:
pca_x.explained_variance_ratio_.round(decimals=2)

array([0.17, 0.08, 0.06, 0.04, 0.04, 0.04, 0.04, 0.03, 0.03, 0.03, 0.03,
       0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02,
       0.02, 0.02, 0.02, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.01, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  ])

In [48]:
X_ = X_std[:,:-10]

In [49]:
X_FIN_TEST_ = X_FIN_std[:,:-10]

In [50]:
def transform(Y_cat):

    for i in range(len(Y_cat)):
        if Y_cat[i] <50:
            Y_cat[i] = 0
        elif Y_cat[i] >=100:
            Y_cat[i] = 2
        else:
            Y_cat[i] = 1
    return Y_cat.astype(np.int16)

# Data Splitting

In [51]:
from sklearn.model_selection import train_test_split

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X_, Y, test_size=0.33, random_state=42, shuffle=True)

# Machine Learning

In [53]:
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

In [55]:
#for _ in range(20): 

regressor = MLPRegressor(hidden_layer_sizes=(91,20,), activation="relu", max_iter=500, random_state=8)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
print("For {}".format(0))
print("Accuracy Score {}".format(accuracy_score(transform(y_test), transform(y_pred))))
print("Mean Squared Error: {}\n".format(mean_squared_error(y_test, y_pred)))

For 0
Accuracy Score 0.946331767256957
Mean Squared Error: 0.06125767979761475

