In [39]:
%matplotlib inline
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import inspect
import warnings
warnings.filterwarnings("ignore")
from sklearn.decomposition import PCA
import seaborn as sns
sns.set()
from sklearn.preprocessing import StandardScaler

In [7]:
# Display pandas floats with thousands separators and two decimal places.
pd.options.display.float_format = "{:,.2f}".format
# Ask numpy to suppress scientific notation when printing small floats.
np.set_printoptions(suppress=True)
# Colour names looked up by integer KMeans label (see `clustering`, which indexes
# this array with `labels_`); six entries, so at most six clusters can be coloured.
color_theme = np.array(['darkgray','lightsalmon','powderblue','black','purple','green'])
#plt.ticklabel_format(style='plain');

In [38]:
# # read data from the csv
# april_df = pd.read_csv('april_df2.csv', index_col = False)

# # drop the unnecessary index column
# april_df.drop('Unnamed: 0', axis=1, inplace=True)

In [8]:
def elbow_plot(min_cluster,max_cluster,scaled_data):
    """Plot KMeans inertia against the number of clusters (elbow plot).

    One KMeans model (k-means++ initialisation, random_state=10) is fitted to
    ``scaled_data`` for every cluster count in ``range(min_cluster, max_cluster)``.
    Note that ``max_cluster`` itself is excluded, as with any ``range``.

    Parameters
    ----------
    min_cluster : int
        First cluster count to try (inclusive).
    max_cluster : int
        End of the cluster-count range (exclusive).
    scaled_data : array-like
        Data passed unchanged to ``KMeans.fit``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    cluster_counts = list(range(min_cluster, max_cluster))
    # `inertia_` is the *sum* of squared distances of samples to their closest
    # centre, not an average, so the axis label below says so.
    inertia = [
        KMeans(n_clusters=n_clusters, init='k-means++', random_state=10).fit(scaled_data).inertia_
        for n_clusters in cluster_counts
    ]
    inertias = pd.DataFrame({'n_clusters': cluster_counts, 'inertia': inertia})
    # Label the axes object returned by pandas directly instead of relying on
    # pyplot's notion of the "current" axes.
    ax = inertias.plot(x='n_clusters', y='inertia')
    ax.set_ylabel('Within-Cluster Sum of Squared Distances (inertia)')
    ax.ticklabel_format(style='plain')
    plt.show()

In [9]:
def _limits_requested(limits):
    """Return True when ``limits`` asks for explicit axis limits.

    ``None`` and the legacy ``[0, 0]`` sentinel (as a list, tuple or array)
    both mean "leave the axis alone".
    """
    if limits is None:
        return False
    try:
        return list(limits) != [0, 0]
    except TypeError:
        # Not iterable (e.g. a single number): hand it to matplotlib as before.
        return True


def histogram_printing(df,column,bins,ylim_list=None,xlim_list=None):
    """Draw and show a histogram of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    column : str
        Column to plot; also used as the x-axis label.
    bins : int or sequence
        Passed straight to ``plt.hist``.
    ylim_list, xlim_list : sequence of two numbers, optional
        Axis limits forwarded to ``plt.ylim`` / ``plt.xlim``. ``None`` (the
        default) or ``[0, 0]`` leaves the corresponding axis auto-scaled.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    plt.hist(df[column], bins=bins)
    if _limits_requested(ylim_list):
        plt.ylim(ylim_list)
    if _limits_requested(xlim_list):
        plt.xlim(xlim_list)
    plt.xlabel(column)
    plt.ticklabel_format(style='plain')
    plt.show()

In [10]:
def transforming_columns(df,col_list):
    """Convert columns to numeric and add a natural-log copy of each.

    The columns named in ``col_list`` are passed through ``pd.to_numeric`` and
    written back, then a ``log_transform_<name>`` column holding
    ``np.log`` of each is added. ``df`` is modified in place and the same
    object is returned.
    """
    # Coerce all requested columns in one go, then write them back.
    numeric_block = df[col_list].apply(pd.to_numeric)
    df[col_list] = numeric_block

    name_template = 'log_transform_{}'
    for source_name in col_list:
        target_name = name_template.format(source_name)
        df[target_name] = np.log(df[source_name])
    return df

In [11]:
def scatter_plot(df,xcolumn,ycolumn):
    """Show a scatter plot of ``df[ycolumn]`` against ``df[xcolumn]``.

    Both axes are labelled with their column names and tick labels use plain
    (non-scientific) formatting. Nothing is returned; the figure is displayed
    with ``plt.show()``.
    """
    x_values = df[xcolumn]
    y_values = df[ycolumn]
    plt.scatter(x_values, y_values)
    plt.xlabel(xcolumn)
    plt.ylabel(ycolumn)
    plt.ticklabel_format(style='plain')
    plt.show()

In [12]:
# rewrite this to create new columns with the scaled data
def scale_(df_with_columns):
    """Return ``df_with_columns`` scaled by ``sklearn.preprocessing.scale``.

    Thin wrapper: the input is forwarded with ``scale``'s default arguments
    and the result is returned unchanged.
    """
    return scale(df_with_columns)

In [13]:
def clustering(df,cluster_amounts,scaled_data):
    """Fit KMeans for several cluster counts and store the labels on ``df``.

    For every ``k`` in ``cluster_amounts`` a KMeans model (k-means++
    initialisation, random_state=10) is fitted to ``scaled_data`` and two
    columns are written to ``df``:

    * ``kmeans_<k>``        - the raw integer cluster label (0, 1, 2, ...)
    * ``kmeans_<k>_color``  - the ``color_theme`` entry for that label

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to annotate; modified in place and also returned.
    cluster_amounts : iterable of int
        Cluster counts to fit.
    scaled_data : array-like
        Data passed to ``KMeans.fit``; its rows must line up with ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new columns added.

    Raises
    ------
    IndexError
        If any requested cluster count exceeds the number of colours in
        ``color_theme``. Raised before any model is fitted, so ``df`` is left
        untouched in that case.
    """
    cluster_amounts = list(cluster_amounts)
    palette_size = len(color_theme)
    # Fail fast: indexing color_theme with a label >= palette_size would
    # otherwise only blow up after the fit and after df was partly modified.
    unsupported = [k for k in cluster_amounts if k > palette_size]
    if unsupported:
        raise IndexError(
            'cluster_amounts {} exceed the {} colours available in color_theme'.format(
                unsupported, palette_size))

    for n_clusters in cluster_amounts:
        model = KMeans(n_clusters=n_clusters, init='k-means++', random_state=10)
        model.fit(scaled_data)
        labels = model.labels_
        # putting the cluster values into the dataframe raw (i.e. 0,1,2,3)
        df['kmeans_{}'.format(n_clusters)] = labels
        df['kmeans_{}_color'.format(n_clusters)] = color_theme[labels]
    return df

In [14]:
def cluster_scatter(df,xcolumn,ycolumn,cluster_value):
    """Draw a cluster-coloured scatter plot and per-cluster histograms.

    For every cluster count ``k`` in ``cluster_value`` a new figure is created
    with two panels in the top row of a 2x2 grid:

    * left  - scatter of ``ycolumn`` against ``xcolumn``, coloured by the
      ``kmeans_<k>_color`` column;
    * right - one 200-bin histogram of ``xcolumn`` per cluster, drawn in that
      cluster's colour.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that already carries the ``kmeans_<k>_color`` columns.
    xcolumn, ycolumn : str
        Columns to plot.
    cluster_value : iterable of int
        Cluster counts to draw, one figure each.

    Returns
    -------
    None
    """
    for n_clusters in cluster_value:
        color_column = 'kmeans_{}_color'.format(n_clusters)
        fig = plt.figure(figsize=(10,7))
        scatter_ax = fig.add_subplot(221)
        hist_ax = fig.add_subplot(222)

        scatter_ax.scatter(x=df[xcolumn], y=df[ycolumn], c=df[color_column])
        scatter_ax.set_ylabel(ycolumn)
        scatter_ax.set_xlabel(xcolumn)
        scatter_ax.ticklabel_format(style='plain')
        scatter_ax.title.set_text('Scatter plot on {} clustering\n'.format(n_clusters))

        # The label and title do not depend on the cluster, so set them once
        # rather than on every pass of the loop below.
        hist_ax.set_xlabel(xcolumn)
        hist_ax.title.set_text('Histogram on {} clustering\n'.format(n_clusters))
        for j in range(n_clusters):
            cluster_color = color_theme[j]
            in_cluster = df[color_column] == cluster_color
            hist_ax.hist(x=df[in_cluster][xcolumn], color=cluster_color, bins=200)

In [15]:
def description(df,column,cluster_value):
    """Per-cluster summary statistics for one or more variables.

    Rows of ``df`` are grouped by the ``kmeans_<cluster_value>_color`` column.
    For every variable in ``column`` the result contains ``describe()`` output
    per cluster plus three extra columns:

    * ``index``    - cluster mean divided by the overall mean of the variable
    * ``percent``  - the cluster's share of the ``count`` total, in percent
    * ``variable`` - the variable name

    Parameters
    ----------
    df : pandas.DataFrame
        Frame carrying the cluster colour column and the variables.
    column : iterable of str
        Variables to summarise.
    cluster_value : int
        Cluster count identifying which ``kmeans_<k>_color`` column to use.

    Returns
    -------
    pandas.DataFrame
        One row per (variable, cluster), sorted by ``variable`` then ``mean``.
        An empty frame is returned when ``column`` is empty.
    """
    group_key = 'kmeans_{}_color'.format(cluster_value)
    summaries = []
    for variable in column:
        grouped = df.groupby(group_key)[variable]
        stats = grouped.describe()
        stats = stats.assign(
            index=grouped.mean() / df[variable].mean(),
            percent=stats['count'] / stats['count'].sum() * 100,
            variable=variable,
        )
        summaries.append(stats)

    if not summaries:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; build the result with a
    # single concat instead of growing a frame inside the loop.
    return pd.concat(summaries).sort_values(['variable','mean'])

In [16]:
def description_ver2(df,column,cluster_value):
    """Per-cluster summary statistics, ordered by cluster size.

    Rows of ``df`` are grouped by the ``kmeans_<cluster_value>_color`` column.
    For every variable in ``column`` the result contains ``describe()`` output
    per cluster plus these extra columns:

    * ``index``    - cluster mean divided by the overall mean of the variable
    * ``percent``  - the cluster's share of the ``count`` total, in percent
    * ``variable`` - the variable name
    * ``cluster``  - copy of the row index (the cluster colour)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame carrying the cluster colour column and the variables.
    column : iterable of str
        Variables to summarise.
    cluster_value : int
        Cluster count identifying which ``kmeans_<k>_color`` column to use.

    Returns
    -------
    pandas.DataFrame
        One row per (variable, cluster), sorted by ``percent``, ``cluster``,
        ``variable``. An empty frame is returned when ``column`` is empty.
    """
    group_key = 'kmeans_{}_color'.format(cluster_value)
    summaries = []
    for variable in column:
        grouped = df.groupby(group_key)[variable]
        stats = grouped.describe()
        stats = stats.assign(
            index=grouped.mean() / df[variable].mean(),
            percent=stats['count'] / stats['count'].sum() * 100,
            variable=variable,
        )
        summaries.append(stats)

    if not summaries:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; build the result with a
    # single concat instead of growing a frame inside the loop.
    combined = pd.concat(summaries)
    combined['cluster'] = combined.index
    return combined.sort_values(['percent','cluster','variable'])

In [17]:
def boxplot(df,column):
    """Draw a boxplot of ``df[column]`` and print how many rows lie outside the whiskers.

    The bounds are read from the second y-coordinate of the first and second
    whisker lines returned by ``plt.boxplot``. Two lines are printed: the
    number of rows strictly below the first bound and the number strictly
    above the second. Nothing is returned.
    """
    values = df[column]
    bp = plt.boxplot(values)
    whisker_ydata = [line.get_ydata() for line in bp["whiskers"]]
    lower_bound = whisker_ydata[0][1]
    upper_bound = whisker_ydata[1][1]

    below_count = df[values < lower_bound][column].count()
    print(below_count,' users are below the lower bound value of ' , lower_bound)
    above_count = df[values > upper_bound][column].count()
    print(above_count,' users are above the upper bound value of ' , upper_bound)

In [18]:
# def export_clusters(fname, *args):
#     frames1 = [item for item in args]
#     with open(fname, mode='a+') as f:
#         for df in frames1:
#             df.to_csv(fname, mode='a', header = f.tell() == 0)
#             f.write('\n')
# def export_clusters(fname, column, *args):
#     frames1 = [item for item in args]
#     with open(fname, mode='a+') as f:
#         for df in frames1:
#             f.write(column)
#             df.to_csv(fname, mode='a', header = f.tell() == 0)
#             f.write('\n')
            
def export_clusters(fname, column, *args, header=False):
    """Append one titled CSV section per DataFrame to ``fname``.

    For every frame in ``args`` the file receives, in order: a blank line, the
    ``column`` title on its own line, the frame as CSV (index included), and a
    trailing blank line. The file is opened in append mode, so existing
    content is kept.

    Parameters
    ----------
    fname : str or path-like
        Destination file; created if it does not exist.
    column : str
        Title line written above every frame.
    *args : pandas.DataFrame
        Frames to export.
    header : bool, default False
        Whether each frame's column names are written. The default reproduces
        the previous output: the old expression ``f.tell() == 0`` was evaluated
        after the title had been written, so it was always False and headers
        were never emitted.

    Returns
    -------
    None
    """
    with open(fname, mode='a+') as f:
        for df in args:
            f.write('\n')
            f.write(column)
            f.write('\n')
            # to_csv appends through its own file handle, so push the title to
            # disk first; previously this only happened as a side effect of
            # calling f.tell().
            f.flush()
            df.to_csv(fname, mode='a', header=header)
            f.write('\n')

In [19]:
# # practice df
# df = pd.DataFrame(columns=['aa','b','c','d'], index=['x','y','z','m','n','o','p','h','v','xx','yy','zz','mm','nn','oo','pp','hh','vv'])
# df.loc['x'] = pd.Series({'aa':1, 'b':5, 'c':2, 'd':3})
# df.loc['y'] = pd.Series({'aa':45, 'b':35, 'c':21, 'd':23})
# df.loc['z'] = pd.Series({'aa':10, 'b':59, 'c':22, 'd':544})
# df.loc['m'] = pd.Series({'aa':18, 'b':559, 'c':232, 'd':774})
# df.loc['n'] = pd.Series({'aa':15, 'b':59, 'c':222, 'd':554})
# df.loc['o'] = pd.Series({'aa':12, 'b':5, 'c':2, 'd':546})
# df.loc['p'] = pd.Series({'aa':20, 'b':9, 'c':20, 'd':5})
# df.loc['h'] = pd.Series({'aa':1, 'b':69, 'c':27, 'd':544})
# df.loc['v'] = pd.Series({'aa':20, 'b':529, 'c':72, 'd':34})
# df.loc['xx'] = pd.Series({'aa':321, 'b':54, 'c':22, 'd':322})
# df.loc['yy'] = pd.Series({'aa':412, 'b':35, 'c':231, 'd':23})
# df.loc['zz'] = pd.Series({'aa':1220, 'b':359, 'c':22, 'd':2474})
# df.loc['mm'] = pd.Series({'aa':183, 'b':559, 'c':732, 'd':774})
# df.loc['nn'] = pd.Series({'aa':125, 'b':529, 'c':2292, 'd':554})
# df.loc['oo'] = pd.Series({'aa':122, 'b':5, 'c':22, 'd':546})
# df.loc['pp'] = pd.Series({'aa':1330, 'b':93, 'c':20, 'd':5})
# df.loc['hh'] = pd.Series({'aa':1422, 'b':69, 'c':427, 'd':544})
# df.loc['vv'] = pd.Series({'aa':1200, 'b':329, 'c':72, 'd':4})

In [29]:
# df.dtypes

In [30]:
# scaled_data = scale_(df_with_columns = df[['aa','b','c']])

In [31]:
#elbow_plot(min_cluster=2,max_cluster=10,scaled_data=scaled_data)

In [32]:
# histogram_printing(df,column='aa',bins = 300,ylim_list = [0,5],xlim_list = [0,600])

In [33]:
# df = transforming_columns(df,col_list = ['aa','b','c','d'])

In [34]:
# scatter_plot(df,xcolumn = 'b',ycolumn = 'log_transform_aa')

In [35]:
# df = clustering(df,cluster_amounts = [3,4,5],scaled_data=scaled_data)

In [36]:
# cluster_scatter(df,xcolumn = 'aa',ycolumn = 'b',cluster_value = [3,4])

In [37]:
# description(df,column = ['aa','b','c'],cluster_value = 3)