In [None]:
#imports
import pandas as pd
import numpy as np

Get Data from external sources

In [None]:
from sklearn import datasets
iris = datasets.load_iris()
print(type(iris))
X, y = iris.data, iris.target

In [72]:
def fnHandleMsg(msg,msgType=""):
    print(msg)


def fnAssignColnames(df,colNames=None):
    if colNames is not None and len(colNames)==len(df.columns):
        df.columns=colNames

    return df

    
    
def fnCreatePdFrameFromArray(arrayTuple,colNames=None):
    #return pd.DataFrame(np.column_stack(arrayTuple))
    df=None
    for a in arrayTuple:
        if len(a.shape)==1:
            n=pd.Series(a)
        else:
            n=pd.DataFrame(a)
        if df is None:
            df=pd.DataFrame(n)
        else:
            df=pd.concat([df,n],axis=1)

    return df

def fnGetDf(data,colNames=None):
    df=fnCreatePdFrameFromArray(data,colNames)
    df=fnAssignColnames(df,colNames)
    return df
                        
def fnFilterMissingValues(df):
    missingValues=df.isnull().sum(axis=1)
    missingValuesCount=sum(missingValues>0)
    fnHandleMsg("No. of Missing Values in DF : "+str(missingValuesCount))
    df=df.dropna()
    return df,missingValuesCount

def fnGetUniqueValLenColumns(df):
    iDfLen=len(df)
    lUniqueValCount=[]
    for col in df.columns:
        iUniqueValLen=len(np.unique(df[col]))
        lUniqueValCount.append(iUniqueValLen)
        
    return lUniqueValCount


def fnNormalizeCols(df,colNames):
    fnHandleMsg("Normalizing Values in DF for columns: "+str(colNames))
    from sklearn import preprocessing
    dfNew = pd.DataFrame(df[colNames])
    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(dfNew)
    df_normalized = pd.DataFrame(np_scaled)
    df_normalized.columns=colNames
    df[colNames]=df_normalized
    #for col in colNames:
        #df[col]=df_normalized[col]
        
    return df

def fnPlotCols(df,colNames=None,num_bins=50):
    import matplotlib.pyplot as plt

    if colNames is None:
        colNames=df.columns
        
    for colName in colNames:
        colData=df[colName]
        fnPlotHistogram(colData,colName)
        fnPlotBoxplot(colData,colName)
        

def fnPlotHistogram(colData,colName,num_bins=50):
    mu=np.mean(colData)
    sigma=np.std(colData)


    fig, ax = plt.subplots()

    # the histogram of the data
    n, bins, patches = ax.hist(colData, num_bins, normed=True)

    # add a 'best fit' line
    y = ((1 / (np.sqrt(2 * np.pi) * sigma)) *np.exp(-0.5 * (1 / sigma * (bins - mu))**2))
    ax.plot(bins, y, '--')
    ax.set_xlabel('Smarts')
    ax.set_ylabel('Probability density')
    ax.set_title(r'Histogram of {0}: mu={1}, sigma={2}'.format(colName,mu,sigma))

    # Tweak spacing to prevent clipping of ylabel
    fig.tight_layout()
    plt.show()

def fnPlotBoxplot(colData,colName):
    colData=df[colName]
    fig1, ax1 = plt.subplots()
    ax1.set_title(r'Box Plot of {0}'.format(colName))
    ax1.boxplot(colData)

    # Tweak spacing to prevent clipping of ylabel
    fig.tight_layout()
    plt.show()
    
    
def GetkFoldedTrainValTest(df,nFolds=5,valPer=20,testPer=20,randomstate=300,seed=200):    
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=nFolds,shuffle=True,random_state=randomstate)
    """
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    df=df.loc[perm]
    df.reset_index(inplace=True)
    """
    nSet=0
    for train_index, test_index in kf.split(df):
        tempDF=df.iloc[train_index]
        testDF=df.iloc[test_index]
        kf2 = KFold(n_splits=2,random_state=randomstate)
        for train_index, val_index in kf2.split(tempDF):
            trainDF=tempDF.iloc[train_index]
            valDF=tempDF.iloc[val_index]
            nSet+=1
            print("----------Set : {0}---------".format(nSet))
            
            print(test_index)
            print(train_index)
            print(val_index)
            
     
            
    


Convert Data into Pandas Dataframe

In [62]:
outputVarName='fType'
df=fnGetDf((X,y),("SL","SH","PL","PH",outputVarName))

print(df.head())

    SL   SH   PL   PH  fType
0  5.1  3.5  1.4  0.2      0
1  4.9  3.0  1.4  0.2      0
2  4.7  3.2  1.3  0.2      0
3  4.6  3.1  1.5  0.2      0
4  5.0  3.6  1.4  0.2      0


DataSet Exploration

In [None]:
print(df.describe())

In [None]:
df,missingValuesCount=fnFilterMissingValues(df)

In [None]:
lUniqueValCount=fnGetUniqueValLenColumns(df)
print(list(zip(lUniqueValCount,df.dtypes)))

In [None]:
print(df.describe())
colNames=["SH","SL","PL","PH"]
df=fnNormalizeCols(df,colNames)
print(df.describe())

In [47]:
#fnPlotCols(df,colNames)

In [73]:
print(df[outputVarName].value_counts())
GetkFoldedTrainValTest(df)

2    50
1    50
0    50
Name: fType, dtype: int64
----------Set : 1---------
[  6   7  12  14  15  19  20  22  27  30  39  45  49  63  65  72  73  75
  81  87  90 106 112 117 120 124 134 135 143 148]
[ 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59]
----------Set : 2---------
[  6   7  12  14  15  19  20  22  27  30  39  45  49  63  65  72  73  75
  81  87  90 106 112 117 120 124 134 135 143 148]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59]
[ 60  61  62  63  64  65  66  67  68 