<img src="mmu_logo.png" style="height: 80px;" align=left>  

# Week 4: Learning Objectives

By the end of this lesson, you should be able to:
- write Python code for association rule mining
- experiment with the laundry dataset using linear regression


## Import the Libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from apyori import apriori
import seaborn as sns 
import warnings
# Silence every warning for the rest of the session so cell output stays
# uncluttered. NOTE: this also hides deprecation warnings raised by the
# libraries used below.
warnings.filterwarnings("ignore")


# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Association Rule Mining

### Importing the Dataset

In [None]:
# load store_data.csv dataset
# header=None: the first row of the file is read as data, not as column
# names, so the columns get integer labels.

store_data = pd.read_csv('store_data.csv', header=None)
# Preview the first rows.
store_data.head()

### Data Preprocessing

The Apriori library we are going to use requires our dataset to be in the form of a list of lists, where the whole dataset is a big list and each transaction in the dataset is an inner list within the outer big list. Currently we have data in the form of a pandas dataframe. To convert our pandas dataframe into a list of lists, execute the following script:

In [None]:
# transform the original dataset into ARM friendly format
# (a list of transactions, where each transaction is itself a list of items)

# Outer list that will hold one inner list per transaction.
records = []

# your codes here...


### Applying Apriori Algorithm

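Choose values for the parameters of the `apriori` function (for example `min_support`, `min_confidence` and `min_lift`) and store the results as a list in `association_results`, which the cells below rely on. <br>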


In [None]:
# run apriori function

# your codes here...



### Check the output

In [None]:
# Number of results produced by the apriori cell above
# (association_results must support len(), e.g. be a list).
len(association_results)

In [None]:
# Show the first result in raw form to see how each entry is structured.
print(association_results[0])

### Output in better presentation

In [None]:
# Print every mined rule as "antecedent -> consequent" with its metrics.
#
# Each entry of association_results is read positionally:
#   item[1]        support of the itemset
#   item[2][0]     first ordered statistic of the entry, laid out as
#                  (items_base, items_add, confidence, lift)
# Only the first ordered statistic of each entry is reported.
cnt = 0  # stays 0 when there are no results

for cnt, item in enumerate(association_results, start=1):
    first_stat = item[2][0]

    # Take both sides of the rule from the same statistic that supplies the
    # confidence and lift, so the printed direction matches the numbers.
    # Iterating the unordered itemset (item[0]) gives no reliable direction
    # and only works for itemsets with exactly two items.
    antecedent = ", ".join(sorted(first_stat[0])) or "(empty)"
    consequent = ", ".join(sorted(first_stat[1])) or "(empty)"
    print(f"(Rule {cnt}) {antecedent} -> {consequent}")

    # second index of the inner list
    print(f"Support: {round(item[1], 3)}")

    # third and fourth fields of the first ordered statistic
    print(f"Confidence: {round(first_stat[2], 4)}")
    print(f"Lift: {round(first_stat[3], 4)}")
    print("=====================================")

# Linear Regression

In [None]:
# Read ./data/file1.xlsx ... ./data/file4.xlsx (sheet 'Reading', columns G:AB,
# skipping the first two rows), keep the first 31 rows of each file and stack
# them into one DataFrame.
frames = []
for i in range(4):
    filename = './data/file' + str(i+1) + ".xlsx"
    df = pd.read_excel(filename, sheet_name='Reading', skiprows=2, usecols='G:AB')
    # Discard everything after the 31st row of the sheet.
    df.drop(df.index[31:], inplace=True)
    frames.append(df)

# A single pd.concat replaces the repeated DataFrame.append calls
# (DataFrame.append no longer exists in pandas 2.x). The per-file row labels
# are kept on purpose: a later cell groups by df_all.index.
df_all = pd.concat(frames)
df_all = df_all.fillna(0)  # missing readings count as zero
df_all = df_all/56*5 # this is just a data transformation to Ringgit Malaysia
df_all = df_all.round(2)
df_all.head()

In [None]:
# Transform the dataset format

# Reshape from wide to long: one row per (original column name, value) pair.
df_all_melt = df_all.melt()
df_all_melt.columns = ['Item', 'Sales']
# Sales values of the rows whose Item is 'Detergent 1'.
# NOTE: the name `a` is reassigned by the stripplot cell that follows.
a=df_all_melt.loc[df_all_melt['Item']=='Detergent 1','Sales']
# Grand total of the Sales column (displayed as the cell output).
df_all_melt['Sales'].sum()

In [None]:
# Strip plot of every sales value, one column of points per item.
sns.set(style="whitegrid", color_codes=True, rc={'figure.figsize': (11, 6)})

a = sns.stripplot(data=df_all_melt, x="Item", y="Sales")
# Turn the item names sideways so they do not overlap.
a.set_xticklabels(a.get_xticklabels(), rotation=90)
a.set_ylabel('Sales')
a.set_title('Items and Sales')
a.set_xlabel('Items')


In [None]:
# Plot swarmplot

# Figure size (11 x 6) and white-grid styling for the plot drawn below.
sns.set(rc={'figure.figsize':(11, 6)})
sns.set(style="whitegrid", color_codes=True)

# your codes here...


In [None]:
# plot pairplot

# Keep only the W1, W2 and W3 columns; reset_index() turns the current row
# labels into an ordinary column and gives the frame a fresh 0..n-1 index.
df_w123 = df_all[['W1','W2','W3']].reset_index()

# your codes here...


In [None]:
# plot heatmap to show the correlation

sns.set(rc={'figure.figsize': (11, 6)})
# Pairwise correlation between all columns, rounded to one decimal place.
correlation_matrix = df_all.corr().round(1)

# your codes here...


In [None]:
# Perform grouping for Item-Sales

# Total sales per item, with 'Item' turned back into a regular column.
df_itemSales = df_all_melt.groupby(["Item"]).sum().reset_index()
df_itemSales.head()

In [None]:
# Plot Item-Sales barchart

sns.set(style="whitegrid", color_codes=True, rc={'figure.figsize': (11, 6)})

b = sns.barplot(data=df_itemSales, x="Item", y="Sales")
b.set_xticklabels(b.get_xticklabels(), rotation=90)
b.set_ylabel('Sales')
b.set_xlabel('Items')
b.set_title('Item Sales')
b.set(ylim=(0, 10000))

# Write each bar's height (no decimals) vertically, 18 points above the
# centre of the bar's top edge.
for p in b.patches:
    bar_top = p.get_height()
    bar_centre = p.get_x() + p.get_width() / 2.
    b.annotate("%.0f" % bar_top, (bar_centre, bar_top),
               ha='center', va='center', rotation=90,
               xytext=(0, 18), textcoords='offset points')



In [None]:
# Getting the daily sales

# Rows that share an index label are summed together, the result is
# renumbered 0..n-1, and all columns are then added up row by row into a
# single 'Sales' column.
df_dailySales = (
    df_all.groupby(df_all.index).sum()
    .reset_index(drop=True)
    .sum(axis=1)
    .to_frame(name='Sales')
)


In [None]:
# Plot daily sales

sns.set(style="whitegrid", color_codes=True, rc={'figure.figsize': (11, 6)})

b = sns.barplot(data=df_dailySales, x=df_dailySales.index, y="Sales")
b.set_xticklabels(b.get_xticklabels(), rotation=0)
b.set_ylabel('Sales')
b.set_xlabel('Day')
b.set_title('Daily Sales')

b.set(ylim=(0, 5000))

# Write each bar's height (no decimals) vertically, 18 points above the
# centre of the bar's top edge.
for p in b.patches:
    bar_top = p.get_height()
    bar_centre = p.get_x() + p.get_width() / 2.
    b.annotate("%.0f" % bar_top, (bar_centre, bar_top),
               ha='center', va='center', rotation=90,
               xytext=(0, 18), textcoords='offset points')


In [None]:
# Get the distribution plot for VM_Sum

# Row-wise total over the columns from 'Detergent 1' through 'Bag 2'
# (label slice, both ends included), stored as a new 'VM_Sum' column.
df_all['VM_Sum'] = df_all.loc[:, 'Detergent 1':'Bag 2'].sum(axis=1)
# NOTE(review): recent seaborn releases deprecate distplot in favour of
# histplot/displot -- confirm which seaborn version the course targets.
sns.distplot(df_all['VM_Sum'])


In [None]:
# Plot scatterplots of VM_Sum against each of the 6 washers

features = ['W1', 'W2', 'W3', 'W4', 'W5', 'W6']
target = df_all['VM_Sum']

# One subplot per washer, laid out side by side in a single row.
for i, col in enumerate(features):
    plt.subplot(1, len(features), i+1)
    x = df_all[col]
    y = target
    # x and y are passed by keyword: seaborn 0.12+ no longer accepts them
    # positionally, while the keyword form works on older versions too.
    sns.scatterplot(x=x, y=y)


# LR using statsmodels

In [None]:
# Prepare dataset for Linear Regression for W3 against VM_Sum

import statsmodels.api as sm

# Double brackets keep X and Y as single-column DataFrames rather than Series.
X = df_all[['W3']]      # explanatory variable
Y = df_all[['VM_Sum']]  # response variable


In [None]:
# Using OLS

# your codes here...


In [None]:
# Observed values (default colour) with the model's predictions overlaid in red.
# `predictions` must already exist -- presumably created in the OLS exercise
# cell above.
plt.scatter(X['W3'],Y)
plt.scatter(X['W3'],predictions, color='red')


In [None]:
# Retrieve the model summary

# add_constant appends a constant column to X so the fitted model has an
# intercept term.
X = sm.add_constant(X)
# Ordinary least squares with Y as the response and X as the regressors.
model = sm.OLS(Y, X).fit()
# Fitted values for the same rows the model was trained on.
predictions = model.predict(X)
model.summary()


In [None]:
# Observed values (default colour) with the fitted values from the
# model-summary cell overlaid in red.
plt.scatter(X['W3'],Y)
plt.scatter(X['W3'],predictions, color='red')


# LR using sklearn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [None]:
# Quick look at the raw relationship: W3 on the x-axis, VM_Sum on the y-axis,
# drawn as unconnected round markers.
X = df_all['W3']
Y = df_all['VM_Sum']

plt.plot(X, Y, marker='o', linestyle='')

In [None]:
# Features as a one-column DataFrame, target as a Series.
X = df_all[['W3']]
Y = df_all['VM_Sum']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=5
)

# Shapes of the four pieces, one per line.
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape, sep='\n')


In [None]:
# Linear Regression model construction

# your codes here...


In [None]:
# Get the coefficient
# (`lm` is expected to be the model fitted in the construction cell above)

lm.coef_ 

In [None]:
# Get the intercept
# (`lm` is expected to be the model fitted in the construction cell above)

lm.intercept_

In [None]:
# perform prediction on X_test

# your codes here...



In [None]:
# Construct a table to compare VM_Sum and PredictedSum

# Y_test_pred is expected to hold the predictions for the rows of X_test (as
# a pandas object with a 'PredictedSum' column, created in the prediction
# cell above). It therefore has to be lined up with X_test / Y_test, not with
# the first rows of the full df_all. Every piece gets a fresh 0..n-1 index so
# the rows are paired by position.
df_tmp = pd.concat(
    [X_test.reset_index(drop=True), Y_test.reset_index(drop=True)], axis=1
)
df_new = pd.concat([df_tmp, Y_test_pred.reset_index(drop=True)], axis=1)
df_new.head()


In [None]:
# Plot a scatterplot to compare W3 and PredictedSum and VM_Sum

# Columns are looked up by name in df_new; linestyle='' draws markers only.
plt.plot( 'W3', 'PredictedSum', data=df_new, linestyle='', marker='o')  # predictions as circles
plt.plot( 'W3', 'VM_Sum', data=df_new, linestyle='', marker='+')  # observed values as plus signs


In [None]:
# Plot X_test against Y_test and Y_test_pred

# Reduce both to one-dimensional objects for plotting.
# NOTE(review): this rebinds X_test to its 'W3' column, so the two-axis
# .loc[:, 'W3'] lookup will presumably fail if the cell is run a second time
# without re-running the train/test split first -- confirm.
X_test = X_test.loc[:,'W3']
Y_test_pred = Y_test_pred.squeeze()

plt.plot(X_test, Y_test, 'o')  # observed test values as round markers
plt.plot(X_test, Y_test_pred)  # predictions with the default line style
plt.show()

In [None]:
# Show the coefficient again for reference.
lm.coef_ 

In [None]:
# Show the intercept again for reference.
lm.intercept_

In [None]:
# Get the model to predict Y_test value when X_test = 60

# your codes here...


