In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pandas_datareader.data as web
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC

In [2]:
def get_requirements():
    """Print the assignment banner and the list of program requirements.

    Takes no arguments and returns ``None``; the text is written to stdout,
    one entry of ``requirement_lines`` per output line.
    """
    # Each entry is emitted exactly as written; a leading "\n" yields a blank
    # separator line and a leading "\t" indents a sub-item.
    requirement_lines = (
        "Artificial Intelligence 2",
        "Mitchell Mujwit",
        "\nProgram Requirements:",
        "1. Build machine learning model",
        "2. Predict if it will rain tomorrow by analyzing past data",
        "3. Import necessary libraries",
        "4. Research how to install any missing packages, if necessary.",
        "5. Create at least 3 functions called by the program.",
        "\na. main(): calls at least two other functions.",
        "\nb. get_requirements(): displays the program requirements.",
        "\nc. artificial_intelligence_2(): displays the following data.",
        "\n6. Data set url (do not use downloaded file): https://rattle.togaware.com/weatherAUS.csv",
        "\n7. When running program:",
        "\na. Document any issues.",
        "\tb. Document solutions attempted.",
        "\n8. Algorithms used (identify each): Logistic Regression, Random Forest, Decision Tree. Support Vector Machine:",
        "\ta. Advantages",
        "\tb. Disadvantages",
        "\tc. How did each compare?",
    )
    print("\n".join(requirement_lines))

In [3]:
def _evaluate_classifier(clf, x_train, x_test, y_train, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns:
        tuple: ``(accuracy, seconds)`` where ``accuracy`` is the value of
        ``accuracy_score`` on the test split and ``seconds`` is the wall-clock
        time spent fitting, predicting and scoring.
    """
    t0 = time.time()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    score = accuracy_score(y_test, y_pred)
    return score, time.time() - t0


def main():
    """Download the weatherAUS data, clean it, and compare four classifiers.

    The function prints a numbered walk-through of the data frame (structure,
    cleaning, encoding, normalisation, feature selection) and then reports the
    accuracy and elapsed time of Logistic Regression, Random Forest, Decision
    Tree and a linear Support Vector Machine predicting ``RainTomorrow`` from
    ``Humidity3pm``.

    All classifiers are trained and scored on the same seeded train/test
    split so that their accuracies are reproducible and directly comparable.
    Returns ``None``; all results are written to stdout.
    """
    df = pd.read_csv('https://rattle.togaware.com/weatherAUS.csv')
    print("\n1: Print indexes")
    print(df.index)
    print("\n2: Print columns")
    print(df.columns)
    print("\n3: Print data frame")
    print(df.head(5))
    print(df.tail(5))
    print("\n4: Print type")
    print(type(df))
    print("\n5: Print data attribute type")
    print(df.dtypes)
    print("\n6: Print values in array format")
    print(df.values)
    print("\n7: Print index of all column names")
    print(df.columns)
    print("\n8: Print data frame info")
    print("\nPrint summary stats")
    # DataFrame.info() writes to stdout itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info(verbose=True)
    print("\n9: Number of rows")
    print(df.shape[0])
    print("\n10: Number of columns")
    print(df.shape[1])
    print("\n11: Number of rows and columns")
    print(df.shape)
    print("\n12: Number of elements")
    print(df.size)
    print("\n14: Find null values")
    print(df.count().sort_values())
    print("\n15: Cleaning data")
    # `columns=` already selects the axis; a separate `axis=1` is redundant.
    df = df.drop(columns=['Sunshine', 'Evaporation',
                          'Cloud3pm', 'Cloud9am', 'Location', 'Date', 'RISK_MM'])
    print("\n16: Print new number o datafram rows and columns")
    print(df.shape)
    print("\n17: Remove null values in df then print number of dataframe rows and columns")
    df = df.dropna(how='any')
    print(df.shape)
    print("\n18: Remove outliers using z score")
    z = np.abs(stats.zscore(df.select_dtypes(include=np.number)))
    # Take an explicit copy so the column assignments below modify this frame
    # rather than a view of the pre-filter frame.
    df = df[(z < 3).all(axis=1)].copy()
    print("\n19: Print new number of dataframe rows and columbs")
    print(df.shape)
    print("\n20: Modify columns yes=1, no=0")
    print(df['RainToday'])

    # Assign the encoded column back instead of calling
    # `df[col].replace(..., inplace=True)`: the chained in-place form operates
    # on a temporary Series and does not reliably update `df`.
    # `.astype(int)` fails loudly if any value other than 'Yes'/'No' is
    # present (it would have been mapped to NaN).
    yes_no = {'No': 0, 'Yes': 1}
    df['RainToday'] = df['RainToday'].map(yes_no).astype(int)
    df['RainTomorrow'] = df['RainTomorrow'].map(yes_no).astype(int)
    print("After modification")
    print(df['RainToday'])

    print("\n21: Print unique categorical column values")
    categorical_columns = ['WindGustDir', 'WindDir3pm', 'WindDir9am']
    for col in categorical_columns:
        print(np.unique(df[col]))

    print("\n22: Convert categorical column")
    df = pd.get_dummies(df, columns=categorical_columns)

    print("Before normalizing data")
    print(df.iloc[4:9])

    print("\n23: Normalize input values variables to eliminate biases")
    # NOTE(review): the scaler is fit on the whole frame before the
    # train/test split below, so test rows influence the min/max used.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(df)
    df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)
    print("After normalizing")
    print(df.iloc[4:9])

    print("24: Exploratory dta anaylsis")
    x = df.loc[:, df.columns != 'RainTomorrow']
    y = df['RainTomorrow']
    selector = SelectKBest(chi2, k=3)
    selector.fit(x, y)
    print(x.columns[selector.get_support(indices=True)])

    # The columns kept here are hard-coded, presumably the three features the
    # selector reported when the notebook was written - TODO confirm against
    # the printed selection above.
    df = df[['Humidity3pm', 'Rainfall', 'RainToday', 'RainTomorrow']]
    x = df[['Humidity3pm']]
    y = df['RainTomorrow']

    print("\n25: Begin data modeling")

    # One seeded split shared by every model: results are reproducible and
    # each classifier is scored on the same test rows.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.25, random_state=0)

    # (heading, estimator, accuracy label, timing label) for each model.
    models = [
        ("Logistic Regression:",
         LogisticRegression(random_state=0),
         "Accuracy using logistics classifier",
         "Time taken using logistics classifier"),
        ("\nRandom Forest:",
         RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0),
         "Accuracy using forest classifier: ",
         "Time taken using random forest classifier: "),
        ("\nDecision Tree:",
         DecisionTreeClassifier(random_state=0),
         "Accuracy using decision tree classifier: ",
         "Time taken using decision tree classifier: "),
        ("\nSupport vector machine:",
         svm.SVC(kernel='linear'),
         "Accuracy using support and vector machine classifier: ",
         "Time taken using vector machine classifier: "),
    ]
    for heading, clf, accuracy_label, time_label in models:
        print(heading)
        score, elapsed = _evaluate_classifier(
            clf, x_train, x_test, y_train, y_test)
        print(accuracy_label, score)
        print(time_label, elapsed)

In [4]:
# Print the assignment requirements, then run the analysis in main().
get_requirements()
main()

Artificial Intelligence 2
Mitchell Mujwit

Program Requirements:
1. Build machine learning model
2. Predict if it will rain tomorrow by analyzing past data
3. Import necessary libraries
4. Research how to install any missing packages, if necessary.
5. Create at least 3 functions called by the program.

a. main(): calls at least two other functions.

b. get_requirements(): displays the program requirements.

c. artificial_intelligence_2(): displays the following data.

6. Data set url (do not use downloaded file): https://rattle.togaware.com/weatherAUS.csv

7. When running program:

a. Document any issues.
	b. Document solutions attempted.

8. Algorithms used (identify each): Logistic Regression, Random Forest, Decision Tree. Support Vector Machine:
	a. Advantages
	b. Disadvantages
	c. How did each compare?

1: Print indexes
RangeIndex(start=0, stop=192918, step=1)

2: Print columns
Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustD

['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']
['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']
['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']

22: Convert categorical column
Before normalizing data
   MinTemp  MaxTemp  Rainfall  WindGustSpeed  WindSpeed9am  WindSpeed3pm  \
4     17.5     32.3       1.0           41.0           7.0          20.0   
5     14.6     29.7       0.2           56.0          19.0          24.0   
6     14.3     25.0       0.0           50.0          20.0          24.0   
7      7.7     26.7       0.0           35.0           6.0          17.0   
8      9.7     31.9       0.0           80.0           7.0          28.0   

   Humidity9am  Humidity3pm  Pressure9am  Pressure3pm  ...  WindDir9am_NNW  \
4         82.0         33.0       1010.8       1006.0  ...               0   
5         55.0         23.0       1009.2       1005.4  ...            