In [5]:
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 13 09:25:14 2019
Nate Jermain
Random forest analysis for stock delineation for Chub Mackerel
"""

import pandas as pd
import numpy as np

df=pd.read_csv("C:/Users/w10007346/Dropbox/ATC shape analsis project (1)/ACM_ShapeAnalysis/Analysis/wave_shape.csv")

df.head()
orig_resp=df['pop']

####### Data Cleaning ###########

df.columns.values
len(df.columns.values)

#### Transforming skewed features
# check features for skew
skew_feats=df.drop('pop', axis=1).skew().sort_values(ascending=False)
skewness=pd.DataFrame({'Skew':skew_feats})
skewness=skewness[abs(skewness)>0.75].dropna()

# use box cox transformation
from scipy.special import boxcox1p
skewed_features=skewness.index
lam=0.15

for i in skewed_features:
    df[i]=boxcox1p(df[i],lam)

# check
df.skew().sort_values(ascending=False)# improved

##### Remove duplicate features and NAs
# remove id column
df=df.drop('Unnamed: 0', axis=1)
# response is population
resp=df['pop']
df=df.drop('pop', axis=1)

df.columns.values


# fill in missing values
df.isnull().sum().sort_values(ascending=False)

# remove features with all nas
df=df.drop(['Ws1c1','Ws1c2', 'Ws2c4'], axis=1)

df.isnull().sum().sort_values(ascending=False)

# features with small numbers of nas get filled with means
df.Ws2c3.describe()
df.Ws2c3=df.Ws2c3.fillna(df.Ws2c3.dropna().mean())

df.Ws2c1.describe()
df.Ws2c1=df.Ws2c1.fillna(df.Ws2c1.dropna().mean())

df.isnull().sum().sort_values(ascending=False)
df.Ws3c1.describe()
df.Ws3c1=df.Ws3c1.fillna(df.Ws3c1.dropna().mean())
df.Ws4c10=df.Ws4c10.fillna(df.Ws4c10.dropna().mean())
df.Ws3c4=df.Ws3c4.fillna(df.Ws3c4.dropna().mean())
df.isnull().values.any()

# create response variable as factor
resp=pd.factorize(resp)
resp=resp[0]

# Split into training and test sets
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(df, orig_resp, random_state = 0, test_size=.3)


# make sure test set is balanced
from imblearn.under_sampling import RandomUnderSampler

untest = RandomUnderSampler(return_indices=True)
X_untest, y_untest, id_untest = untest.fit_sample(test_X, test_y)

pd.Series(y_untest).value_counts().plot('bar') # equal sampling now (check)

# prior to resampling
from sklearn.ensemble import RandomForestClassifier

# the model prior to hyperparameter optimization
Best_RFC=RandomForestClassifier(n_estimators=4000, max_features='auto', max_depth=20,
                           min_samples_split=5, min_samples_leaf=1,
                           bootstrap=True, criterion='gini')

Best_RFC.fit(train_X, train_y)

# predict test Y values
ypred=Best_RFC.predict(X_untest)

# apply to test set
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_untest, ypred))

# confusion matrix to evaluate predictions
pd.crosstab(y_untest, ypred, rownames=['Observed'], colnames=['Predicted'])


########### Undersampling of common classes 
import matplotlib.pyplot as plt
fig = plt.figure()
plot= pd.Series(train_y).value_counts().plot('bar', color=['green', 'blue', 'red']) # unbalanced design
fig = plot.get_figure()
fig.savefig('Imbalanced.png', dpi=300) 


from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(return_indices=True)
X_rus, y_rus, id_rus = rus.fit_sample(train_X, train_y)

pd.Series(y_rus).value_counts().plot('bar') # equal sampling now (check)

fig = plt.figure()
plot = pd.Series(y_rus).value_counts().plot('bar', color=['green', 'blue', 'red'])  
fig = plot.get_figure()
fig.savefig('oversamp.png', dpi=300) 

### MODEL

Best_RFC.fit(X_rus, y_rus)

# predict test Y values
ypred=Best_RFC.predict(X_untest)

# apply to test set
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_untest, ypred))

# confusion matrix to evaluate predictions
pd.crosstab(y_untest, ypred, rownames=['Observed'], colnames=['Predicted'])




######## Oversampling of rare classes  ######################
# lets try oversampling too
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler()
X_ros, y_ros = ros.fit_sample(train_X, train_y)

# equal sampling now (check)
fig = plt.figure()
plot = pd.Series(y_ros).value_counts().plot('bar', color=['green', 'blue', 'red'])  
fig = plot.get_figure()
fig.savefig('oversamp.png', dpi=300) 

# fit best model to training dataset
Best_RFC.fit(X_ros, y_ros)

# predict test Y values
ypred=Best_RFC.predict(X_untest)

# apply to test set
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_untest, ypred))

# confusion matrix to evaluate predictions
pd.crosstab(y_untest, ypred, rownames=['Observed'], colnames=['Predicted'])


########## Using SMOTE ##############
from imblearn.over_sampling import SMOTE
smot = SMOTE()
X_smot, y_smot = smot.fit_sample(train_X, train_y)


pd.Series(y_smot).value_counts().plot('bar') # equal sampling now (check)


# fit best model to training dataset
Best_RFC.fit(X_smot, y_smot)

# predict test Y values
ypred=Best_RFC.predict(X_untest)

# apply to test set
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_untest, ypred))

# confusion matrix to evaluate predictions
pd.crosstab(y_untest, ypred, rownames=['Observed'], colnames=['Predicted'])





NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [291]:
#reading from excel 

import pandas as pd
import numpy as np


from pandas import ExcelWriter
from pandas import ExcelFile
df=pd.read_excel('C:\\Users\\Mojtaba\\Projects\\Iris-classification\\family.xlsx', 2)
df1=pd.read_excel('C:\\Users\\Mojtaba\\Projects\\Iris-classification\\family.xlsx', 3)
# df=pd.read_excel('C:\\Users\\Mojtaba\\Projects\\Iris-classification\\family.xlsx', sheet_name='ali')
# df=df.transpose().reset_index().rename(columns={'index':'Variable'})
# df = df.T

# DF = pd.read_table(df,sep="\t",index_col = [0]).transpose() #Add index_col = [0] to not have index values as own row during transposition
# m, n = DF.shape
# DF.reset_index(drop=False, inplace=True)
# DF.head()
# df['id2'] = df.index
df.set_index('name',inplace=True)
df= df.T
# df.reset_index(drop=True, inplace=True)

# df=df.transpose()
# df.index

# df.columns = df.iloc[0]


# dff=df.to_excel(header=None)
with pd.ExcelWriter('output1.xlsx') as writer:  
    df.to_excel(writer, sheet_name='Sheet_name_1')

df=pd.read_excel('C:\\Users\\Mojtaba\\Projects\\Iris-classification\\output1.xlsx',0)
df
df.describe()
# df.describe
# df = df.set_index('name').T
# df = df.set_index('name').T.reset_index()
# df = df.set_index('name').T.rename_axis('Variable').rename_axis(None, 1).reset_index()
# df
# print (df)
# df.drop(index='Variable')
# df=df.transpose()
# df.drop(index=0)
# df.describe()
# df1.info()
# df.mean()
# df.info()
# 
# orig_resp=df['pop']

Unnamed: 0,age,Unnamed: 4,price,salary
count,6.0,0.0,5.0,6.0
mean,29.833333,,1429.8,169333.333333
std,13.242608,,407.388267,72450.442833
min,21.0,,1100.0,110000.0
25%,21.5,,1270.0,127500.0
50%,24.0,,1299.0,154000.0
75%,31.75,,1340.0,165500.0
max,55.0,,2140.0,310000.0


In [292]:
####### Data Cleaning ###########

df.columns.values
len(df.columns.values)


8

In [293]:
#### Transforming skewed features
# check features for skew
# skew_feats=df.drop('pop', axis=1).skew().sort_values(ascending=False)
skew_feats=df.skew().sort_values(ascending=False)
skewness=pd.DataFrame({'Skew':skew_feats})
skewness=skewness[abs(skewness)>0.75].dropna()
print(skewness)

            Skew
price   1.943060
salary  1.929351
age     1.834565


In [294]:
# use box cox transformation
from scipy.special import boxcox1p
skewed_features=skewness.index
lam=0.15
for i in skewed_features:
    df[i]=boxcox1p(df[i],lam)

# check
df.skew().sort_values(ascending=False)# improved

# from scipy import stats
# fer_boxcox=stats.boxcox(df['fertility'])[0]
# print(pd.Series(fer_boxcox).skew())
# sns.distplot(fer_boxcox)
# # for i in skewed_features:
# # df[i],lmbda=boxcox(df[i], lmbda= None )

price         1.750713
age           1.544847
salary        1.469501
Unnamed: 4         NaN
dtype: float64

In [295]:
##### Remove duplicate features and NAs
# remove id column
# df=df.drop('Unnamed: 0', axis=1)
# response is population
# df=df.drop([0,2])
# resp=df['come']
# df1=df.drop( df.columns[1], axis=1, inplace=True)
df1=df.drop(columns='Unnamed: 4')
# df=df.drop('come', axis=1)
print(df1)
# df
# df.columns.values

  Unnamed: 0       age  gender    state      price     salary come
0       muri  4.071754  female  married  12.967979  31.862221  yes
1       wudi  4.697012    male  married  14.395413  33.485383   no
2       jami  3.932510  female      NaN  12.395701  33.856700  yes
3     muri.1  5.527074  female  married  12.810715  31.362625  yes
4       kami  4.201461    male   single        NaN  37.756878   no
5       jizi  3.932510  female  married  12.876739  33.173658   no


In [297]:
# fill in missing values
df=df1
df.isnull().sum().sort_values(ascending=False)

# remove features with all nas
# df=df.drop(['Ws1c1','Ws1c2', 'Ws2c4'], axis=1)

df.isnull().sum().sort_values(ascending=False)
df

Unnamed: 0.1,Unnamed: 0,age,gender,state,price,salary,come
0,muri,4.071754,female,married,12.967979,31.862221,yes
1,wudi,4.697012,male,married,14.395413,33.485383,no
2,jami,3.93251,female,,12.395701,33.8567,yes
3,muri.1,5.527074,female,married,12.810715,31.362625,yes
4,kami,4.201461,male,single,,37.756878,no
5,jizi,3.93251,female,married,12.876739,33.173658,no


In [301]:
# features with small numbers of nas get filled with means
df.price.describe()
df.price=df.price.fillna(df.price.dropna().mean())
df.isnull().values.any()
df

Unnamed: 0.1,Unnamed: 0,age,gender,state,price,salary,come
0,muri,4.071754,female,married,12.967979,31.862221,yes
1,wudi,4.697012,male,married,14.395413,33.485383,no
2,jami,3.93251,female,,12.395701,33.8567,yes
3,muri.1,5.527074,female,married,12.810715,31.362625,yes
4,kami,4.201461,male,single,13.089309,37.756878,no
5,jizi,3.93251,female,married,12.876739,33.173658,no


In [304]:
# create response variable as factor
resp=pd.factorize(resp)
resp=resp[0]
resp
# how do I specify my response variable?

array([0, 1, 0, 0, 1, 1], dtype=int64)

In [306]:
# Split into training and test sets
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(df, resp, random_state = 0, test_size=.3)
# x_train,x_test,y_train,y_test=train_test_split(x, y, test_size=0.2, random_state=7)

In [309]:

# make sure test set is balanced
from imblearn.under_sampling import RandomUnderSampler

untest = RandomUnderSampler()
X_untest, y_untest, id_untest = untest.fit_sample(test_X, test_y)

pd.Series(y_untest).value_counts().plot('bar') # equal sampling now (check)

ValueError: not enough values to unpack (expected 3, got 2)

In [310]:
# prior to resampling
from sklearn.ensemble import RandomForestClassifier

# the model prior to hyperparameter optimization
Best_RFC=RandomForestClassifier(n_estimators=4000, max_features='auto', max_depth=20,
                           min_samples_split=5, min_samples_leaf=1,
                           bootstrap=True, criterion='gini')

Best_RFC.fit(train_X, train_y)

# could not convert string to float:??

ValueError: could not convert string to float: 'wudi'