In [1]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix 
from sklearn import tree

from sklearn.tree import DecisionTreeClassifier 
import graphviz 


In [2]:
def import_data(path='weatherAUS.csv'):
    """Load the weather CSV and prepare it for modelling.

    Processing steps:
      1. Read the CSV file at ``path``.
      2. Drop rows whose target column ``RainTomorrow`` is missing.
      3. Fill missing values in numeric columns with that column's mean.
         Non-numeric columns are left untouched.
      4. Add integer columns ``RainTodayFlag`` and ``RainTomorrowFlag``
         (1 where the source column equals 'Yes', otherwise 0).
         ``RainTomorrowFlag`` is the target variable.

    Parameters
    ----------
    path : str, default 'weatherAUS.csv'
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame including the two flag columns. The first rows
        are also printed as a side effect.
    """
    data = pd.read_csv(path)

    # Drop records where target RainTomorrow=NaN
    data = data[data['RainTomorrow'].notna()]

    # For other columns with missing values, fill them in with column mean.
    # numeric_only=True is required: the frame contains string columns and
    # recent pandas versions raise TypeError when asked to average them.
    data = data.fillna(data.mean(numeric_only=True))

    # Create a flag for RainToday and RainTomorrow; a missing RainToday value
    # compares unequal to 'Yes' and therefore maps to 0.
    data['RainTodayFlag'] = data['RainToday'].eq('Yes').astype('int64')
    data['RainTomorrowFlag'] = data['RainTomorrow'].eq('Yes').astype('int64')

    print(data.head())
    return data

In [3]:
def splitdataset(data):
    """Separate features from the target and make a train/test split.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature columns listed below and the
        ``RainTomorrowFlag`` target column.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)`` where ``X`` is the
        feature frame, ``y`` the target values, and the remaining four items
        are the result of a 80/20 split with ``random_state=0``.
    """
    feature_columns = [
        'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
        'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
        'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
        'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainTodayFlag',
    ]

    # Features and target
    features = data[feature_columns]
    target = data['RainTomorrowFlag'].values

    # Hold out 20% of the rows for testing; fixed seed keeps the split repeatable
    splits = train_test_split(features, target, test_size=0.2, random_state=0)

    return (features, target, *splits)


In [4]:
def train_using_gini(X_train, y_train, criterion, splitter, mdepth, clweight, minleaf):
    """Build and fit a decision tree classifier.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    criterion, splitter
        Forwarded to ``DecisionTreeClassifier`` under the same names.
    mdepth
        Forwarded as ``max_depth``.
    clweight
        Forwarded as ``class_weight``.
    minleaf
        Forwarded as ``min_samples_leaf``.

    Returns
    -------
    The fitted classifier (created with ``random_state=0``).
    """
    hyperparams = {
        'criterion': criterion,
        'splitter': splitter,
        'max_depth': mdepth,
        'class_weight': clweight,
        'min_samples_leaf': minleaf,
        'random_state': 0,
    }

    model = tree.DecisionTreeClassifier(**hyperparams)
    model.fit(X_train, y_train)
    return model

In [5]:
def tree_summary(clf_gini):
    """Print a short summary of a fitted decision tree.

    Prints the class labels, the tree depth, the number of leaves and the
    number of input features, followed by a separator line and a blank line.

    Parameters
    ----------
    clf_gini
        A fitted classifier exposing ``classes_``, ``tree_.max_depth``,
        ``tree_.n_leaves`` and ``n_features_in_`` (or the legacy
        ``n_features_``).

    Returns
    -------
    None
    """
    # ``n_features_`` no longer exists on current scikit-learn estimators;
    # ``n_features_in_`` is the replacement. Fall back to the legacy name only
    # when the new one is absent, so older installations keep working.
    n_features = getattr(clf_gini, 'n_features_in_', None)
    if n_features is None:
        n_features = clf_gini.n_features_

    print('*************** Tree Summary ***************')
    print('Classes: ', clf_gini.classes_)
    print('Tree Depth: ', clf_gini.tree_.max_depth)
    print('No. of leaves: ', clf_gini.tree_.n_leaves)
    print('No. of features: ', n_features)
    print('--------------------------------------------------------')
    print("")
    

In [6]:
# Function to make predictions
def prediction(X_test, clf_gini):
    """Predict labels for ``X_test`` with the given classifier.

    The predictions are printed under a "Predicted values:" heading and
    then returned unchanged.

    Parameters
    ----------
    X_test
        Samples handed to ``clf_gini.predict``.
    clf_gini
        Object providing a ``predict`` method.

    Returns
    -------
    Whatever ``clf_gini.predict(X_test)`` returns.
    """
    predicted = clf_gini.predict(X_test)

    # Heading and values on separate lines
    print("Predicted values:", predicted, sep="\n")

    return predicted

In [7]:
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    """Print evaluation metrics comparing true and predicted labels.

    Prints, in order: the confusion matrix, the accuracy as a percentage,
    and the classification report, each section framed by banner lines.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels.

    Returns
    -------
    None
    """
    divider = '--------------------------------------------------------'

    # --- Confusion matrix section ---
    print('*******************Confusion matrix *******************')
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: ", matrix)
    print(divider)
    print("")

    # --- Accuracy and per-class report section ---
    print('*************** Evaluation on Test Data ***************')
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy : ", accuracy_pct)

    report = classification_report(y_test, y_pred)
    print("Report : ", report)
    print(divider)
    print("")

In [8]:
def dtree_render(clf_gini, X):
    """Export a fitted decision tree to Graphviz and render it as a PNG.

    Parameters
    ----------
    clf_gini
        Fitted tree classifier; its ``classes_`` supply the class names.
    X : pandas.DataFrame
        Feature frame whose column names label the tree's split nodes.

    Returns
    -------
    graphviz.Source
        The graph object built from the exported DOT text. As a side effect
        the graph is rendered under the name 'dtree_render' in PNG format and
        opened in a viewer (``view=True``).
    """
    # Plain list of names rather than a pandas Index.
    feature_names = list(X.columns)

    # One name per class, however many classes the model was fitted on
    # (previously exactly two classes were assumed).
    class_names = [str(label) for label in clf_gini.classes_]

    dot_data = tree.export_graphviz(
        clf_gini,
        out_file=None,  # return the DOT source as a string
        feature_names=feature_names,
        class_names=class_names,
        filled=True,
        rounded=True,
    )

    graph = graphviz.Source(dot_data)
    graph.format = 'png'
    graph.render('dtree_render', view=True)

    return graph

In [9]:
def main():
    """Run the full pipeline: load, split, train, summarise, evaluate, render.

    Trains a gini / best-splitter tree with ``mdepth=3``, ``clweight=None``
    and ``minleaf=1000``, prints the tree summary and test-set metrics, and
    renders the tree.
    """
    # Load and split the data
    data = import_data()
    X, _, X_train, X_test, y_train, y_test = splitdataset(data)

    # Fit the classifier
    clf_gini = train_using_gini(
        X_train,
        y_train,
        criterion='gini',
        splitter='best',
        mdepth=3,
        clweight=None,
        minleaf=1000,
    )

    # Describe the fitted tree
    tree_summary(clf_gini)

    # Predict on the held-out set and report the metrics
    y_pred = prediction(X_test, clf_gini)
    cal_accuracy(y_test, y_pred)

    # Draw the tree
    dtree_render(clf_gini, X)
    

In [10]:
# Entry point: run the pipeline when this code is executed directly
if __name__ == "__main__":
    main()

         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6     5.469824  7.624853   
1  2008-12-02   Albury      7.4     25.1       0.0     5.469824  7.624853   
2  2008-12-03   Albury     12.9     25.7       0.0     5.469824  7.624853   
3  2008-12-04   Albury      9.2     28.0       0.0     5.469824  7.624853   
4  2008-12-05   Albury     17.5     32.3       1.0     5.469824  7.624853   

  WindGustDir  WindGustSpeed WindDir9am  ... Pressure9am  Pressure3pm  \
0           W           44.0          W  ...      1007.7       1007.1   
1         WNW           44.0        NNW  ...      1010.6       1007.8   
2         WSW           46.0          W  ...      1007.6       1008.7   
3          NE           24.0         SE  ...      1017.6       1012.8   
4           W           41.0        ENE  ...      1010.8       1006.0   

   Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  RainTomorrow  \
0  8.000000  4.503167     16.9