In [1]:
#imports
import pandas as pd
import numpy as np

Get data from external sources (scikit-learn's bundled Iris dataset)

In [2]:
from sklearn import datasets
# Load the Iris dataset shipped with scikit-learn and show the container type it comes in.
iris = datasets.load_iris()
print(type(iris))
# Keep the `data` and `target` members under the short names used by the rest of the notebook.
X, y = iris.data, iris.target

<class 'sklearn.datasets.base.Bunch'>


In [5]:
def fnHandleMsg(msg,msgType=""):
    """Print ``msg`` to standard output.

    ``msgType`` is accepted but not used by this implementation.
    """
    print(msg)


def fnAssignColnames(df,colNames=None):
    """Rename the columns of ``df`` in place when a matching list of names is supplied.

    Parameters:
        df: DataFrame whose column labels may be replaced.
        colNames: optional sequence of new labels. It is applied only when
            its length equals the number of columns in ``df``; otherwise the
            frame is left untouched.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Nothing to do when no names were given or the count does not line up.
    if colNames is None or len(colNames)!=len(df.columns):
        return df

    df.columns=colNames
    return df

    
    
def fnCreatePdFrameFromArray(arrayTuple,colNames=None):
    """Place several 1-D / 2-D array-likes side by side in one DataFrame.

    Parameters:
        arrayTuple: iterable of array-likes (NumPy arrays, lists, Series,
            DataFrames). A 1-D item contributes one column, a 2-D item
            contributes one column per array column.
        colNames: optional sequence of column labels. It is applied only when
            its length equals the number of resulting columns; otherwise the
            positional labels of the individual pieces are kept (these can
            repeat, e.g. ``[0, 1, 0]``).

    Returns:
        A DataFrame with the pieces concatenated column-wise. An empty
        ``arrayTuple`` yields an empty DataFrame.
    """
    # pd.DataFrame() accepts both 1-D and 2-D input, so no shape inspection is needed.
    frames=[pd.DataFrame(a) for a in arrayTuple]
    if not frames:
        return pd.DataFrame()

    # One concat call instead of re-concatenating the growing frame per item.
    df=pd.concat(frames,axis=1)

    if colNames is not None and len(colNames)==df.shape[1]:
        df.columns=list(colNames)

    return df

def fnGetDf(data,colNames=None):
    """Build a DataFrame from ``data`` and label its columns.

    ``data`` and ``colNames`` are forwarded to ``fnCreatePdFrameFromArray``;
    the result is then passed through ``fnAssignColnames`` with the same
    ``colNames`` and returned.
    """
    return fnAssignColnames(fnCreatePdFrameFromArray(data,colNames),colNames)
                        
def fnFilterMissingValues(df):
    """Drop every row of ``df`` that contains a null value.

    The number of rows holding at least one null is reported through
    ``fnHandleMsg`` before the rows are removed.

    Returns:
        A ``(cleaned, count)`` tuple: the frame without the affected rows and
        the number of rows that had at least one null.
    """
    # True for each row in which at least one cell is null.
    rowHasNull=df.isnull().sum(axis=1)>0
    missingValuesCount=sum(rowHasNull)
    fnHandleMsg("No. of Missing Values in DF : "+str(missingValuesCount))
    return df.dropna(),missingValuesCount

def fnGetUniqueValLenColumns(df):
    """Count the distinct values in each column of ``df``.

    Columns are addressed by position, so frames with repeated column labels
    are handled one column at a time. Missing values count as one distinct
    value. Counting is done by pandas (hash based), so object columns that
    mix types which cannot be ordered against each other are supported.

    Returns:
        A list of ints, one per column, in column order.
    """
    return [int(df.iloc[:,pos].nunique(dropna=False)) for pos in range(df.shape[1])]


def fnNormalizeCols(df,colNames):
    """Min-max scale the given columns of ``df`` using scikit-learn's MinMaxScaler.

    Parameters:
        df: DataFrame holding the columns to scale. It is modified in place.
        colNames: a column label or a sequence of column labels to scale.

    Returns:
        The same ``df`` object, with the selected columns replaced by their
        scaled values.
    """
    from sklearn import preprocessing

    fnHandleMsg("Normalizing Values in DF for columns: "+str(colNames))

    # Accept a single label as well as any sequence of labels.
    cols=[colNames] if isinstance(colNames,str) else list(colNames)
    if not cols:
        return df

    min_max_scaler = preprocessing.MinMaxScaler()
    np_scaled = min_max_scaler.fit_transform(df[cols])

    # Give the scaled values df's own index: column assignment aligns on index
    # labels, so a fresh 0..n-1 index would produce NaN for any row of df
    # whose label is not in that range (for example after rows were dropped).
    df[cols]=pd.DataFrame(np_scaled,columns=cols,index=df.index)

    return df

def fnPlotCols(df,colNames=None,num_bins=50):
    """Draw a histogram and a box plot for each selected column of ``df``.

    Parameters:
        df: DataFrame holding the data to plot.
        colNames: iterable of column labels to plot; all columns when None.
        num_bins: number of histogram bins, forwarded to ``fnPlotHistogram``.
    """
    if colNames is None:
        colNames=df.columns

    for colName in colNames:
        colData=df[colName]
        fnPlotHistogram(colData,colName,num_bins)
        fnPlotBoxplot(colData,colName)
        

def fnPlotHistogram(colData,colName,num_bins=50):
    """Show a density histogram of ``colData`` with a normal curve drawn over it.

    Parameters:
        colData: 1-D numeric data to plot.
        colName: label used for the x axis and in the title.
        num_bins: number of histogram bins.

    The normal curve uses the mean and standard deviation of ``colData`` and
    is skipped when the standard deviation is not positive, because the
    density formula divides by it.
    """
    # Imported here so the function does not depend on a notebook-level `plt`.
    import matplotlib.pyplot as plt

    mu=np.mean(colData)
    sigma=np.std(colData)

    fig, ax = plt.subplots()

    # the histogram of the data (`density` is the current name of the old `normed` flag)
    n, bins, patches = ax.hist(colData, num_bins, density=True)

    # add a 'best fit' line
    if sigma>0:
        y = ((1 / (np.sqrt(2 * np.pi) * sigma)) *np.exp(-0.5 * (1 / sigma * (bins - mu))**2))
        ax.plot(bins, y, '--')
    ax.set_xlabel(str(colName))
    ax.set_ylabel('Probability density')
    ax.set_title(r'Histogram of {0}: mu={1}, sigma={2}'.format(colName,mu,sigma))

    # Tweak spacing to prevent clipping of ylabel
    fig.tight_layout()
    plt.show()

def fnPlotBoxplot(colData,colName):
    """Show a box plot of ``colData`` titled with ``colName``.

    Parameters:
        colData: 1-D numeric data to plot.
        colName: label used in the plot title.
    """
    # Imported here so the function does not depend on a notebook-level `plt`.
    import matplotlib.pyplot as plt

    # Plot the data that was passed in; no lookup in a notebook-level frame.
    fig1, ax1 = plt.subplots()
    ax1.set_title(r'Box Plot of {0}'.format(colName))
    ax1.boxplot(colData)

    # Tweak spacing to prevent clipping of labels
    fig1.tight_layout()
    plt.show()
    

def GetTrainTest(df,testPer=0.2,seed=200):
    """Shuffle ``df`` reproducibly and split it into train and test parts.

    Parameters:
        df: DataFrame to split. It is not modified.
        testPer: fraction of rows (0..1) that goes to the test part.
        seed: value used to seed NumPy's global random generator, which makes
            the shuffle repeatable.

    Returns:
        ``(df_train, df_test)``. Both hold exactly the columns of ``df``; the
        original row labels are discarded.

    Raises:
        ValueError: if ``testPer`` is outside the range 0..1.
    """
    if not 0 <= testPer <= 1:
        raise ValueError("testPer must be between 0 and 1, got {0!r}".format(testPer))

    # Seeds the global NumPy generator; DataFrame.sample below draws from it
    # because no random_state is passed.
    np.random.seed(seed)

    # Shuffle by position so frames with repeated index labels keep one copy of each row.
    positions = np.random.permutation(len(df))

    # drop=True: do not turn the old row labels into an extra 'index' column,
    # which would end up among the features handed to a model.
    shuffled = df.iloc[positions].reset_index(drop=True)

    # Second shuffle kept so the random draws happen in the same sequence as before.
    shuffled = shuffled.sample(frac=1)
    trainsize = int(len(shuffled) * (1-testPer))

    df_train = shuffled.iloc[:trainsize, :]
    df_test = shuffled.iloc[trainsize:, :]
    return df_train,df_test
    
    
    
def GetkFoldedTrainVal(df,nFolds=5,valPer=20,randomstate=300,seed=200):
    """Yield ``(train, validation)`` DataFrame pairs from a shuffled K-fold split.

    Parameters:
        df: DataFrame to split; rows are selected by position.
        nFolds: number of folds, passed to ``KFold`` as ``n_splits``.
        valPer: accepted but not used by this implementation.
        randomstate: passed to ``KFold`` as ``random_state``.
        seed: accepted but not used by this implementation.

    Yields:
        One ``(trainDF, valDF)`` tuple per fold.
    """
    from sklearn.model_selection import KFold

    splitter = KFold(n_splits=nFolds,shuffle=True,random_state=randomstate)
    for trainPositions, valPositions in splitter.split(df):
        yield df.iloc[trainPositions],df.iloc[valPositions]
            
            
     
            
    


Convert the data into a pandas DataFrame

In [6]:
# Name of the column that holds the target (`y`).
outputVarName='fType'
# Combine X and y column-wise and label the columns.
df=fnGetDf((X,y),("SL","SH","PL","PH",outputVarName))

print(df.head())

    SL   SH   PL   PH  fType
0  5.1  3.5  1.4  0.2      0
1  4.9  3.0  1.4  0.2      0
2  4.7  3.2  1.3  0.2      0
3  4.6  3.1  1.5  0.2      0
4  5.0  3.6  1.4  0.2      0


Dataset exploration

In [7]:
# Summary statistics for every column, including the target.
print(df.describe())

               SL          SH          PL          PH       fType
count  150.000000  150.000000  150.000000  150.000000  150.000000
mean     5.843333    3.054000    3.758667    1.198667    1.000000
std      0.828066    0.433594    1.764420    0.763161    0.819232
min      4.300000    2.000000    1.000000    0.100000    0.000000
25%      5.100000    2.800000    1.600000    0.300000    0.000000
50%      5.800000    3.000000    4.350000    1.300000    1.000000
75%      6.400000    3.300000    5.100000    1.800000    2.000000
max      7.900000    4.400000    6.900000    2.500000    2.000000


In [8]:
# Drop rows that contain nulls and keep the number of rows that were affected.
df,missingValuesCount=fnFilterMissingValues(df)

No. of Missing Values in DF : 0


In [9]:
# Pair each column's distinct-value count with its dtype.
lUniqueValCount=fnGetUniqueValLenColumns(df)
print(list(zip(lUniqueValCount,df.dtypes)))

[(35, dtype('float64')), (23, dtype('float64')), (43, dtype('float64')), (22, dtype('float64')), (3, dtype('int32'))]


In [10]:
# Statistics before scaling, for comparison with the second describe() below.
print(df.describe())
# Only the four measurement columns are scaled; the target column is left out.
colNames=["SH","SL","PL","PH"]
df=fnNormalizeCols(df,colNames)
print(df.describe())

               SL          SH          PL          PH       fType
count  150.000000  150.000000  150.000000  150.000000  150.000000
mean     5.843333    3.054000    3.758667    1.198667    1.000000
std      0.828066    0.433594    1.764420    0.763161    0.819232
min      4.300000    2.000000    1.000000    0.100000    0.000000
25%      5.100000    2.800000    1.600000    0.300000    0.000000
50%      5.800000    3.000000    4.350000    1.300000    1.000000
75%      6.400000    3.300000    5.100000    1.800000    2.000000
max      7.900000    4.400000    6.900000    2.500000    2.000000
Normalizing Values in DF for columns: ['SH', 'SL', 'PL', 'PH']
               SL          SH          PL          PH       fType
count  150.000000  150.000000  150.000000  150.000000  150.000000
mean     0.428704    0.439167    0.467571    0.457778    1.000000
std      0.230018    0.180664    0.299054    0.317984    0.819232
min      0.000000    0.000000    0.000000    0.000000    0.000000
25%      0.22

In [11]:
#fnPlotCols(df,colNames)

In [19]:
# Class balance of the target column before splitting.
print(df[outputVarName].value_counts())
# Hold out a test set; the remaining rows are used for train/validation folds.
trainValDF,testDF=GetTrainTest(df)
# The K-fold split of trainValDF is done in a later cell with GetkFoldedTrainVal.

2    50
1    50
0    50
Name: fType, dtype: int64


In [21]:
# Each fold yields a (train, validation) pair. The validation part gets its own
# name so the loop does not overwrite the hold-out `testDF`.
for trainDF,valDF in GetkFoldedTrainVal(trainValDF):
    print(len(trainDF),len(valDF))
    #DO ML

96 24
96 24
96 24
96 24
96 24
