# Data Visualization
### Kanja Saha

## Steps
1. <a href='#import_lib'>Import Libraries</a>
1. <a href='#import_data'>Import Data</a>
2. <a href='#preprocess'>Preprocess Data</a>
3. <a href='#explore'>Explore Data</a>
3. <a href='#implement'>Implement Algorithms</a>

## Import Libraries
<a id='import_lib'></a>
In general, import all libraries before importing data. However, for learning purpose, import libraries in each step as needed. This is will give a better understanding of the libraries and their functions.

## Import Data
<a id='import_data'></a>
In general, import all needed libraries before importing data. If this is a learning execise, import libraries in each step as needed. This is will give a better understanding of the libraries. 

In [1]:
#import necessary libraries
import pandas as pd
from IPython.display import display

# Load the dataset into pandas dataframe
raw_data_all = pd.read_excel("Online_Retail.csv")
print ("Dataset has {} rows(samples) with {} columns(features) each.".format(*raw_data_all.shape))

# since we are looking for customer info, drop all records that has no customerid
raw_data_all=raw_data_all.dropna(subset=['CustomerID'])
raw_data=raw_data_all
# display the top 5 rows of the dataset
raw_data_all.head(5)

#summary of dataset's distribution
#raw_data_all.groupby('Description').size()
#raw_data_all[raw_data_all.Description=='thrown away']
#raw_data_all[raw_data_all.StockCode=='84611B']
raw_data_all.describe()

#items with negative quantity implies returned items, and 0 implies no purchase
#raw_data[raw_data.Quantity<=0].head(5)

from datetime import datetime
import numpy as np
import pandas as pd

LastTransactionDate=raw_data['InvoiceDate'].max()

data=raw_data.groupby('CustomerID').apply(lambda x: pd.Series(dict(
    duration=(pd.to_datetime(x.InvoiceDate.max()) - pd.to_datetime(x.InvoiceDate.min())).days+1,
    Recency=(LastTransactionDate-x.InvoiceDate.max()).days,
    return_count=(x.Quantity < 0).sum(),
    purchase_count=(x.Quantity < 0).sum(),
    return_amount=(x.Quantity*x.UnitPrice < 0).sum(),
    purchase_amount=(x.UnitPrice*x.Quantity > 0).sum())))
data.head()

#pd.to_datetime(raw_data.InvoiceDate)).dt.days

Dataset has 541909 rows(samples) with 8 columns(features) each.


Unnamed: 0_level_0,Recency,duration,purchase_amount,purchase_count,return_amount,return_count
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12346.0,325,1,1,1,1,1
12347.0,1,366,182,0,0,0
12348.0,74,283,31,0,0,0
12349.0,18,1,73,0,0,0
12350.0,309,1,17,0,0,0


In [2]:
#summary of dataset's distribution
import numpy as np
raw_data['CustomerID'] = raw_data['CustomerID'].astype(np.int64)
raw_data.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,406829.0,406829.0,406829.0
mean,12.061303,3.460471,15287.69057
std,248.69337,69.315162,1713.600303
min,-80995.0,0.0,12346.0
25%,2.0,1.25,13953.0
50%,5.0,1.95,15152.0
75%,12.0,3.75,16791.0
max,80995.0,38970.0,18287.0


In [3]:
#items with negative quantity implies returned items, and 0 implies no purchase
raw_data[raw_data.Quantity<=0].head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
141,C536379,D,Discount,-1,2010-12-01 09:41:00,27.5,14527,United Kingdom
154,C536383,35004C,SET OF 3 COLOURED FLYING DUCKS,-1,2010-12-01 09:49:00,4.65,15311,United Kingdom
235,C536391,22556,PLASTERS IN TIN CIRCUS PARADE,-12,2010-12-01 10:24:00,1.65,17548,United Kingdom
236,C536391,21984,PACK OF 12 PINK PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom
237,C536391,21983,PACK OF 12 BLUE PAISLEY TISSUES,-24,2010-12-01 10:24:00,0.29,17548,United Kingdom


In [4]:
import preprocess_data as o
data,outliers,message=o.remove_outliers(data,False)
print (message)

ImportError: No module named 'preprocess_data'

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
%matplotlib inline

# Produce a scatter matrix for each pair of features in the data
pd.scatter_matrix(data, alpha = 0.3, figsize = (14,8), diagonal = 'kde');

data.corr()

In [None]:
import preprocess_data as o
data,outliers,message=o.remove_outliers(data,True)
print (message)

In [None]:
pd.scatter_matrix(data, alpha = 0.3, figsize = (14,8), diagonal = 'kde');

data.corr()

In [None]:
#NORMALIZE DATA 
import preprocess_data as o
n_data=o.normalize_data(data)
#print (message)
n_data.head(5)
n_data.describe()

In [None]:
pd.scatter_matrix(data, alpha = 0.3, figsize = (14,8), diagonal = 'kde');

n_data.corr()

In [None]:
import matplotlib.pyplot as plt
n_data.boxplot()

In [None]:
# Display a description of the dataset
display(data.describe())

import seaborn as sns
sns.set(color_codes=True)
products=list(data.columns.values)
for  x in products:
    plt.figure(x)
    sns.distplot(data[x]);
sns.pairplot(data);

In [None]:
# TODO: Make a copy of the DataFrame, using the 'drop' function to drop the given feature
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score
    
for  x in products:
    
    new_data = data.drop([x], axis = 1, inplace = False)
    target = data[x]

    # TODO: Split the data into training and testing sets(0.25) using the given feature as the target
    # Set a random state.
    X_train, X_test, y_train, y_test = train_test_split(new_data, target, test_size=0.25, random_state=12)

    # TODO: Create a decision tree regressor and fit it to the training set
    regressor = DecisionTreeRegressor()
    regressor.fit(X_train, y_train);

    # TODO: Report the score of the prediction using the testing set
    score = regressor.score(X_test,y_test)
    print('Score for ''\033[1m'' {} ''\033[0m'' as target feature: ''\033[1m'' {} ''\033[0m'''.format(x,score))


### Implementation: Outlier Detection
Detecting outliers in the data is extremely important in the data preprocessing step of any analysis. The presence of outliers can often skew results which take into consideration these data points. There are many "rules of thumb" for what constitutes an outlier in a dataset. Here, we will use [Tukey's Method for identfying outliers](http://datapigtechnologies.com/blog/index.php/highlighting-outliers-in-your-data-with-the-tukey-method/): An *outlier step* is calculated as 1.5 times the interquartile range (IQR). A data point with a feature that is beyond an outlier step outside of the IQR for that feature is considered abnormal.

In the code block below, you will need to implement the following:
 - Assign the value of the 25th percentile for the given feature to `Q1`. Use `np.percentile` for this.
 - Assign the value of the 75th percentile for the given feature to `Q3`. Again, use `np.percentile`.
 - Assign the calculation of an outlier step for the given feature to `step`.
 - Optionally remove data points from the dataset by adding indices to the `outliers` list.

**NOTE:** If you choose to remove any outliers, ensure that the sample data does not contain any of these points!  
Once you have performed this implementation, the dataset will be stored in the variable `good_data`.

In [None]:
# TODO: Apply PCA by fitting the good data with the same number of dimensions as features
good_data=data
from sklearn.decomposition import PCA
pca = PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
pca.fit(good_data)

# TODO: Transform log_samples using the PCA fit above

#pca_samples = pca.transform(log_samples)

# Generate PCA results plot
pca_results = vs.pca_results(good_data, pca)

print (pca_results['Explained Variance'].cumsum())

In [None]:
data.head()

In [None]:
f, axes = plt.subplots(2, 2, figsize=(7, 7), sharex=True)
sns.distplot( x , color="skyblue", ax=axes[0, 0])
sns.distplot( y , color="olive", ax=axes[0, 1])
sns.distplot(z , color="gold", ax=axes[1, 0])



In [None]:
sns.pairplot(data)

In [None]:
data.hist()

In [None]:
# Histogram
fig = plt.figure(figsize = (6,4))
title = fig.suptitle("Purchase Recency", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax = fig.add_subplot(1,1, 1)
ax.set_xlabel("rRecency")
ax.set_ylabel("Count") 
ax.text(1.2, 800, r'$\mu$='+str(round(data['recency'].mean(),2)), 
         fontsize=12)
freq, bins, patches = ax.hist(data['recency'], color='green', bins=15,
                                    edgecolor='black', linewidth=1)
                                    

# Density Plot
fig = plt.figure(figsize = (6, 4))
title = fig.suptitle("Purchase Recency", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax1 = fig.add_subplot(1,1, 1)
ax1.set_xlabel("Recency")
ax1.set_ylabel("Count") 
sns.kdeplot(data['recency'], ax=ax1, shade=True, color='steelblue')
                                    


In [None]:
# Histogram
fig = plt.figure(figsize = (6,4))
title = fig.suptitle("Purchase Frequency", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax = fig.add_subplot(1,1, 1)
ax.set_xlabel("Frequency")
ax.set_ylabel("Count") 
ax.text(1.2, 800, r'$\mu$='+str(round(data['frequency'].mean(),2)), 
         fontsize=12)
freq, bins, patches = ax.hist(data['frequency'], color='green', bins=15,
                                    edgecolor='black', linewidth=1)
                                    

# Density Plot
fig = plt.figure(figsize = (6, 4))
title = fig.suptitle("Purchase Frequency", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax1 = fig.add_subplot(1,1, 1)
ax1.set_xlabel("Frequency")
ax1.set_ylabel("Count") 
sns.kdeplot(data['frequency'], ax=ax1, shade=True, color='steelblue')
                                    

In [None]:
# Histogram
fig = plt.figure(figsize = (6,4))
title = fig.suptitle("Monetization", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax = fig.add_subplot(1,1, 1)
ax.set_xlabel("monetization")
ax.set_ylabel("Count") 
ax.text(1.2, 800, r'$\mu$='+str(round(data['monetization'].mean(),2)), 
         fontsize=12)
freq, bins, patches = ax.hist(data['monetization'], color='green', bins=15,
                                    edgecolor='black', linewidth=1)
                                    

# Density Plot
fig = plt.figure(figsize = (6, 4))
title = fig.suptitle("Monetization", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax1 = fig.add_subplot(1,1, 1)
ax1.set_xlabel("Monetization")
ax1.set_ylabel("Count") 
sns.kdeplot(data['monetization'], ax=ax1, shade=True, color='steelblue')

In [None]:
# Bar Plot
fig = plt.figure(figsize = (6, 4))
title = fig.suptitle("Recency Count", fontsize=14)
fig.subplots_adjust(top=0.85, wspace=0.3)

ax = fig.add_subplot(1,1, 1)
ax.set_xlabel("Frequency")
ax.set_ylabel("Count") 
w_q = data['frequency'].value_counts()
w_q = (list(w_q.index), list(w_q.values))
ax.tick_params(axis='both', which='major', labelsize=8.5)
bar = ax.bar(w_q[0], w_q[1], color='steelblue', 
        edgecolor='black', linewidth=1)

In [None]:
# Correlation Matrix Heatmap
f, ax = plt.subplots(figsize=(10, 6))
corr = data.corr()
hm = sns.heatmap(round(corr,2), annot=True, ax=ax, cmap="coolwarm",fmt='.2f',
                 linewidths=.05)
f.subplots_adjust(top=0.93)
t= f.suptitle('Correlation Heatmap', fontsize=14)

In [None]:
# Pair-wise Scatter Plots
cols = ['recency', 'frequency', 'monetization']
pp = sns.pairplot(data[cols], size=1.8, aspect=1.8,
                  plot_kws=dict(edgecolor="k", linewidth=0.5),
                  diag_kind="kde", diag_kws=dict(shade=True))

fig = pp.fig 
fig.subplots_adjust(top=0.93, wspace=0.3)
t = fig.suptitle('Pairwise Plots', fontsize=14)

In [None]:
def optimalK(data, nrefs=3, maxClusters=15):
    """
    Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
    Params:
        data: ndarry of shape (n_samples, n_features)
        nrefs: number of sample reference datasets to create
        maxClusters: Maximum number of clusters to test for
    Returns: (gaps, optimalK)
    """
    gaps = np.zeros((len(range(1, maxClusters)),))
    resultsdf = pd.DataFrame({'clusterCount':[], 'gap':[]})
    for gap_index, k in enumerate(range(1, maxClusters)):

        # Holder for reference dispersion results
        refDisps = np.zeros(nrefs)

        # For n references, generate random sample and perform kmeans getting resulting dispersion of each loop
        for i in range(nrefs):
            
            # Create new random reference set
            randomReference = np.random.random_sample(size=data.shape)
            
            # Fit to it
            km = KMeans(k)
            km.fit(randomReference)
            
            refDisp = km.inertia_
            refDisps[i] = refDisp

        # Fit cluster to original data and create dispersion
        km = KMeans(k)
        km.fit(data)
        
        origDisp = km.inertia_

        # Calculate gap statistic
        gap = np.log(np.mean(refDisps)) - np.log(origDisp)

        # Assign this loop's gap statistic to gaps
        gaps[gap_index] = gap
        
        resultsdf = resultsdf.append({'clusterCount':k, 'gap':gap}, ignore_index=True)

    return (gaps.argmax() + 1, resultsdf)  # Plus 1 because index of 0 means 1 cluster is optimal, index 2 = 3 clusters are optimal
    

In [None]:
k, gapdf = optimalK(data, nrefs=5, maxClusters=25)
print ('Optimal k is: ', k)

In [None]:
plt.plot(gapdf.clusterCount, gapdf.gap, linewidth=3)
plt.scatter(gapdf[gapdf.clusterCount == k].clusterCount, gapdf[gapdf.clusterCount == k].gap, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
