In [3]:
import pandas as pd
import numpy as np
from ggplot import *
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [4]:
train_df = pd.read_csv("Yes_Bank_Train.csv",)
test_df = pd.read_csv("Yes_Bank_Test_Data.csv")
sample = pd.read_csv("Yes_Bank_sample_submission.csv")

In [5]:
print("Train Set Shape: {}".format(train_df.shape))
print("Test Set Shape: {}".format(test_df.shape))

Train Set Shape: (17773, 56)
Test Set Shape: (7621, 48)


# Removing Attributes with completly different or uniform distribution

In [6]:
train_df = train_df.drop(labels="fund_symbol", axis=1)
train_df = train_df.drop(labels="fund_name", axis=1)
train_df = train_df.drop(labels="currency", axis=1)

test_df = test_df.drop(labels="fund_symbol", axis=1)
test_df = test_df.drop(labels="fund_name", axis=1)
test_df = test_df.drop(labels="currency", axis=1)

# Data Cleaning

In [7]:
nominal = train_df.describe().columns
all_attr = train_df.columns
categeory = [x for x in all_attr if x not in nominal]

In [8]:
categeory

['category', 'fund_family', 'investment', 'size', 'inception_date']

In [9]:
for x in categeory:
    print("{}: {}".format(x, len(set(train_df[x]))))

category: 108
fund_family: 587
investment: 4
size: 4
inception_date: 3418


In [10]:
for x in categeory:
    print("{}: {}".format(x, len(set(test_df[x]))))

category: 110
fund_family: 312
investment: 4
size: 4
inception_date: 2110


In [11]:
print("Missing Values in Categorical Attributes")
for col in categeory:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Categorical Attributes
category: 0
fund_family: 0
investment: 990
size: 990
inception_date: 0


In [12]:
# This has removed those records where categorical values were missed.

train_df = train_df[~train_df["investment"].isna()]

In [13]:
print("Missing Values in Categorical Attributes")
for col in categeory:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Categorical Attributes
category: 0
fund_family: 0
investment: 0
size: 0
inception_date: 0


In [14]:
print("Missing Values in Nominal Attributes")
for col in nominal:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Nominal Attributes
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 12
morningstar_rating: 0
portfolio_cash: 4
portfolio_stocks: 4
portfolio_bonds: 4
portfolio_others: 4
portfolio_preferred: 4
portfolio_convertable: 4
sectors_basic_materials: 4
sectors_consumer_cyclical: 4
sectors_financial_services: 4
sectors_real_estate: 4
sectors_consumer_defensive: 4
sectors_healthcare: 4
sectors_utilities: 4
sectors_communication_services: 4
sectors_energy: 4
sectors_industrials: 4
sectors_technology: 4
price_earning: 4
bonds_us_government: 4
bonds_aaa: 4
bonds_aa: 4
bonds_a: 4
bonds_bbb: 4
bonds_bb: 4
bonds_b: 4
bonds_below_b: 4
bonds_others: 4
morningstar_return_rating: 0
returns_ytd: 93
returns_2017: 1715
returns_2016: 2664
returns_2015: 3769
returns_2014: 4808
returns_2013: 5870
returns_2012: 6881
returns_2011: 7668
returns_2010: 8440
morningstar_risk_rating: 0
alpha_3y: 1631
beta_3y: 1631
mean_annual_return_3y: 1631
standard_deviation_3y: 1631
sharpe_ratio_3y:

In [15]:
# This has removed those records where most of the values were null for a record.

train_df = train_df[~train_df["portfolio_cash"].isna()]
train_df = train_df[~train_df["net_annual_expenses_ratio"].isna()]

In [16]:
print("Missing Values in Nominal Attributes")
for col in nominal:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Nominal Attributes
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 0
morningstar_rating: 0
portfolio_cash: 0
portfolio_stocks: 0
portfolio_bonds: 0
portfolio_others: 0
portfolio_preferred: 0
portfolio_convertable: 0
sectors_basic_materials: 0
sectors_consumer_cyclical: 0
sectors_financial_services: 0
sectors_real_estate: 0
sectors_consumer_defensive: 0
sectors_healthcare: 0
sectors_utilities: 0
sectors_communication_services: 0
sectors_energy: 0
sectors_industrials: 0
sectors_technology: 0
price_earning: 0
bonds_us_government: 0
bonds_aaa: 0
bonds_aa: 0
bonds_a: 0
bonds_bbb: 0
bonds_bb: 0
bonds_b: 0
bonds_below_b: 0
bonds_others: 0
morningstar_return_rating: 0
returns_ytd: 92
returns_2017: 1702
returns_2016: 2651
returns_2015: 3756
returns_2014: 4795
returns_2013: 5857
returns_2012: 6868
returns_2011: 7655
returns_2010: 8427
morningstar_risk_rating: 0
alpha_3y: 1618
beta_3y: 1618
mean_annual_return_3y: 1618
standard_deviation_3y: 1618
sharpe_ratio_3y: 

In [17]:
for col in nominal:
    if train_df[col].isna().sum() != 0:
        train_df[col] = train_df[col].interpolate(method="piecewise_polynomial")

In [18]:
print("Missing Values in Nominal Attributes")
for col in nominal:
    print("{}: {}".format(col, train_df[col].isna().sum()))

Missing Values in Nominal Attributes
Serial Number: 0
total_net_assets: 0
net_annual_expenses_ratio: 0
morningstar_rating: 0
portfolio_cash: 0
portfolio_stocks: 0
portfolio_bonds: 0
portfolio_others: 0
portfolio_preferred: 0
portfolio_convertable: 0
sectors_basic_materials: 0
sectors_consumer_cyclical: 0
sectors_financial_services: 0
sectors_real_estate: 0
sectors_consumer_defensive: 0
sectors_healthcare: 0
sectors_utilities: 0
sectors_communication_services: 0
sectors_energy: 0
sectors_industrials: 0
sectors_technology: 0
price_earning: 0
bonds_us_government: 0
bonds_aaa: 0
bonds_aa: 0
bonds_a: 0
bonds_bbb: 0
bonds_bb: 0
bonds_b: 0
bonds_below_b: 0
bonds_others: 0
morningstar_return_rating: 0
returns_ytd: 0
returns_2017: 0
returns_2016: 0
returns_2015: 6
returns_2014: 10
returns_2013: 10
returns_2012: 10
returns_2011: 10
returns_2010: 10
morningstar_risk_rating: 0
alpha_3y: 0
beta_3y: 0
mean_annual_return_3y: 0
standard_deviation_3y: 0
sharpe_ratio_3y: 0
treynor_ratio_3y: 0


In [19]:
for col in nominal:
    if train_df[col].isna().sum() != 0:
        train_df[col] = train_df[col].interpolate(method = "linear")

# Assigining Numerical Labels to Categeorical Values

In [20]:
train_df_cp = train_df.copy(deep=True)

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
cache_mapping = {}
for x in categeory:
    mapping = LabelEncoder()
    cache_mapping[x] = mapping
    train_df[x] = mapping.fit_transform(train_df[x])

In [23]:
for x in categeory:
    print(x)
    mapping = cache_mapping[x]
    test_df[x] = mapping.transform(tr_df[x])

category


ValueError: y contains new labels: ['Commodities Precious Metals' 'Trading--Inverse Commodities'
 'Trading--Inverse Debt' 'Trading--Inverse Equity'
 'Trading--Miscellaneous']

   # Feathure Selection Stage 1

In [None]:
test_cols = test_df.columns

In [None]:
xtra_attr = [col for col in all_attr if col not in test_cols]
xtra_attr

In [None]:
Y = train_df["bonds_aaa"].values

In [None]:
for col in xtra_attr:
    train_df = train_df.drop(labels=col, axis=1)

In [None]:
X = train_df.values

# Normalization

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
std_x = StandardScaler()
X_std = std_x.fit_transform(X)

In [None]:
std_y = StandardScaler()
Y_std = std_y.fit_transform(Y.reshape(1, -1))

In [None]:
XX_test_std = std_x.fit_transform(test_df)

# Feathure Selection Stage 2

In [None]:
from sklearn.decomposition import PCA

In [None]:
pca_x = PCA()

In [None]:
pca_x.fit(X_std)

In [None]:
pca_x.explained_variance_ratio_.round(decimals=2)

In [None]:
X_ = X_std[:,:-10]

In [None]:
def transform(Y_cat):

    for i in range(len(Y_cat)):
        if Y_cat[i] <50:
            Y_cat[i] = 0
        elif Y_cat[i] >=100:
            Y_cat[i] = 2
        else:
            Y_cat[i] = 1
    return Y_cat.astype(np.int16)

# Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_, Y, test_size=0.33, random_state=42, shuffle=True)

# Machine Learning

In [None]:
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

In [None]:
#for _ in range(20): 

regressor = MLPRegressor(hidden_layer_sizes=(91,20,), activation="relu", max_iter=500, random_state=8)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
print("For {}".format(i))
print("Accuracy Score {}".format(accuracy_score(transform(y_test), transform(y_pred))))
print("Mean Squared Error: {}\n".format(mean_squared_error(y_test, y_pred)))

In [None]:
train_df.head()

In [None]:
temp1= list(set(train_df["fund_name"]))

In [None]:
temp2 = list(set(test_df["fund_name"]))

In [None]:
len([x for x in temp2 if x not in temp1])