# Color Your Emotion

The purpose of this IPython notebook is to provide a markdown document for the code written for the insight project titled with ***"color your emotion"***. 

Description and report regarding this project can be found at **http://coloryouremotion.com**

Any question or concern is welcome to forward to **lanyiyun@gmail.com**

- Author: Yiyun Lan
- Date: Jun, 2016

===========================================================================================================
## An Overview of Code Structure


- The file consists four modules:
  - module 1: data preprocessing (dataprep.py)
  - module 2: explorative analysis for the event data (events.py)
  - module 3: explorative analysis for the user data (users.py)
  - module 4: prediction model (prediction.py)



===========================================================================================================
## Module 1: data preprocessing (dataprep.py)

- Description: module 1 provides defination of two classes: users and events. Instances created from these two classes                take raw data as input and will return a clean data frame that contains user and event information.



- Class Users
  - description: provide a NULL-free data frame for user information
  - methods: _read\_data_, _clean\_df_, dataprep, describe\_null, load, save


- Class Events
  - description: provide a NULL-free data frame for event information
  - methods: _read\_JSON_, _clean\_df_, _extract\_data_, dataprep, describe\_null, load, save
  

In [28]:
'''''''''''''''''''''''''''''''''
  Module for data preprocessing
'''''''''''''''''''''''''''''''''
import json
import pandas as pd
import pickle
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2


In [112]:
class Users:

    def __init__(self, school):
        '''
        Args:
             school - school name, e.g. 'Stanbridge', 'Brooklyn', 'NewEn', etc (string)
        '''
        self.df = pd.DataFrame()
        self.school = school
        self.file_location = '/Users/lanyiyun/Documents/Insight Project/Data/Consolidated_' + school + '.xlsx'

        #Define a database name
        self.dbname = 'user_db'
        self.username = 'lanyiyun'
        self.pswd = 'lanyiyun'

        #a connection to a database
        self.engine = create_engine('postgresql://%s:%s@localhost/%s'%(self.username,self.pswd,self.dbname))

    def _read_data_(self):
        self.df = pd.read_excel(self.file_location)

    def _clean_df_(self, remove_NULL=False, how='any', fill_NULL=False, fill_value=0):
        if remove_NULL:
            self.df = self.df.dropna(0, how)
            if fill_NULL:
                self.df = self.df.fillna(fill_value, 0)

    def dataprep(self, remove_NULL=False, how='any', fill_NULL=False, fill_value=0):
        '''
        data preprocessing
        Args: (optional)
             remove_NULL: False, not remove missing value; True, remove all missing values
             how: - approach to remove missing values, 'any' or 'all'
             fill_NULL: False, not fill missing data; True, fill missing data
             fill_value: value used to fill the missing data
        '''
        if not remove_NULL:
            self._read_data_()
        self._clean_df_(remove_NULL, how, fill_NULL, fill_value)

    # list the number of missing data for each event
    def describe_null(self):
        notnull = [len(self.df)-i for i in self.df.count(0)]
        notnull_ptg = [1.0*(len(self.df)-i)/len(self.df) for i in self.df.count(0)]
        #self.desnull_df =pd.DataFrame(notnull_ptg, columns = self.df.columns)
        missing_df = pd.DataFrame({'total missing values': notnull,
                                   'missing percentage': notnull_ptg,
                                   'data type': self.df.dtypes},
                                    index = self.df.columns.tolist())
        
        return missing_df

    # load pickle data
    def load(self):
        '''
        Loads existing cleaned data from a pickled dataframe
        '''
        con = None
        con = psycopg2.connect(database = self.dbname, user = self.username, host='localhost', password=self.pswd)

        # query:
        sql_query = """
                    SELECT * FROM user_table;
                    """
        user_data_from_sql = pd.read_sql_query(sql_query,con)

        return user_data_from_sql


    # save data to pickle
    def save(self, create_or_append):
        '''
        save data to database and to DataFrame pickle. User can choose
        whether to append to or overwrite the database table, but the
        DataFrame pickle will always be overwritten

        Args:
            create_or_append: 'create' or 'append'. Applies only to database table
            DataFrame pickle will always be overwritten (string)
        '''

        # Pickle DataFrame as extra backup
        pickle.dump(self.df, open('user_df.pickle', "w"))

        # Write to database
        ## create a database (if it doesn't exist)
        if not database_exists(self.engine.url):
            create_database(self.engine.url)

        self.df.to_sql('user_table', self.engine, if_exists='replace')



In [107]:
class Events:

    def __init__(self, school):
        '''
        Args:
             school - school name, e.g. 'Stanbridges', 'Brooklyn', 'NewEn', etc (string)
        '''
        self.df = pd.DataFrame(columns=['user_id',
                                        'browser',
                                        'city',
                                        'country',
                                        'device_type',
                                        'domain',
                                        'landing_page',
                                        'event_name',
                                        'time',
                                        'session_time',
                                        'path',
                                        'platform'])
        self.school = school
        self.file_location = '/Users/lanyiyun/Documents/Insight Project/Data/' + school + '/events.json'

        #Define a database name
        self.dbname = 'emote_db'
        self.username = 'lanyiyun'
        self.pswd = 'lanyiyun'

        #a connection to a database
        self.engine = create_engine('postgresql://%s:%s@localhost/%s'%(self.username,self.pswd,self.dbname))

    # read JSON data
    def _read_JSON_(self):
        data_events = []
        with open(self.file_location) as f:
            for line in f:
                data_events.append(json.loads(line))
        data_events = pd.read_json(json.dumps(data_events))
        return data_events

    # convert JSON data into dataframe
    def _extract_data_(self):

        df_to_be_extracted = self._read_JSON_()

        try:
            self.df['user_id'] =  [df_to_be_extracted['data'][i]['user_id'] if 'user_id' in
                                   df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['browser'] =  [df_to_be_extracted['data'][i]['browser'] if 'browser' in
                                   df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['city'] =  [df_to_be_extracted['data'][i]['city'] if 'city' in
                                df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['country'] =  [df_to_be_extracted['data'][i]['country'] if 'country' in
                                   df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['device_type'] =  [df_to_be_extracted['data'][i]['device_type'] if 'device_type' in
                                       df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['domain'] =  [df_to_be_extracted['data'][i]['domain'] if 'domain' in
                                  df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['landing_page'] =  [df_to_be_extracted['data'][i]['landing_page'] if 'landing_page' in
                                        df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['event_name'] =  [df_to_be_extracted['name'][i] for i in range(len(df_to_be_extracted))]
            self.df['time'] =  [df_to_be_extracted['data'][i]['time'] if 'time' in
                                df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['session_time'] =  [df_to_be_extracted['data'][i]['session_time'] if 'session_time' in
                                        df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
            self.df['path'] =  [df_to_be_extracted['data'][i]['path'] if 'path' in df_to_be_extracted['data'][i]
                                else np.nan for i in range(len(df_to_be_extracted))]
            self.df['platform'] =  [df_to_be_extracted['data'][i]['platform'] if 'platform' in
                                    df_to_be_extracted['data'][i] else np.nan for i in range(len(df_to_be_extracted))]
        except:
            print("\nError encountered!\n")

    # clean data: missing values, duplicate events, tester events.
    def _clean_df_(self, remove_NULL=False, how='any', fill_NULL=False, fill_value=0):
        self.df['event_name'] = self.df['event_name'].replace(to_replace = 'View Emote Detail',
                                                              value = 'Viewed Emote detail')
        self.df = self.df[self.df.event_name != 'Viewer']
        if remove_NULL:
            self.df = self.df.dropna(0, how)
            if fill_NULL:
                self.df = self.df.fillna(fill_value, 0)

        return self.df

    def dataprep(self, remove_NULL=False, how='any', fill_NULL=False, fill_value=0):
        '''
        data preprocessing
        Args: (optional)
             remove_NULL: False, not remove missing value; True, remove all missing values
             how: - approach to remove missing values, 'any' or 'all'
             fill_NULL: '0', not fill missing data; '1', fill missing data
             fill_value: value used to fill the missing data
        '''
        if not remove_NULL:
            self._extract_data_()
        self._clean_df_(remove_NULL, how, fill_NULL, fill_value)

    # list the number of missing data for each event
    def describe_null(self):
        notnull = [len(self.df)-i for i in self.df.count(0)]
        notnull_ptg = [1.0*(len(self.df)-i)/len(self.df) for i in self.df.count(0)]
        #self.desnull_df =pd.DataFrame(notnull_ptg, columns = self.df.columns)
        missing_df = pd.DataFrame({'total missing values': notnull,
                                   'missing percentage': notnull_ptg,
                                   'data type': self.df.dtypes},
                                    index = self.df.columns.tolist())
        return missing_df

    # load pickle data
    def load(self):
        '''
        Loads existing cleaned data from a pickled dataframe
        '''
        # connect:
        con = None
        con = psycopg2.connect(database = self.dbname, user = self.username, host='localhost', password=self.pswd)

        # query:
        sql_query = """
                    SELECT * FROM event_table;
                    """
        event_data_from_sql = pd.read_sql_query(sql_query,con)

        return event_data_from_sql

    # save data to pickle/database
    def save(self):
        '''
        save data to database and to DataFrame pickle.
        Args:
            DataFrame pickle will always be overwritten (string)
        '''

        # Pickle DataFrame as extra backup
        pickle.dump(self.df, open('event_df.pickle', "w"))

        # Write to database
        ## create a database (if it doesn't exist)
        if not database_exists(self.engine.url):
            create_database(self.engine.url)

        self.df.to_sql('event_table', self.engine, if_exists='replace')


*** ---> Create an instance for event***

In [8]:
s = Dataprep('Stanbridges')

*** ---> dataprep() extract event data from raw JSON file ***

In [9]:
s.dataprep()

In [10]:
s.df.head()

Unnamed: 0,user_id,browser,city,country,device_type,domain,landing_page,event_name,time,session_time,path,platform
0,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747666148,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
1,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747674196,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
2,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747680207,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
3,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747682213,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
4,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747694243,1454744465860,/back/bell-schedules,Mac OS X 10.10.5


*** ---> describe_null() provides a description of missing values***

In [12]:
s.describe_null()

Unnamed: 0,data type,missing percentage,total missing values
user_id,int64,0.0,0
browser,object,0.011522,909
city,object,0.067042,5289
country,object,0.012181,961
device_type,object,0.011522,909
domain,object,0.003828,302
landing_page,object,0.025111,1981
event_name,object,0.0,0
time,object,0.0,0
session_time,object,0.011522,909


*** ---> By setting up remove_NULL to True, dataprep() will remove all the missing values ***

In [13]:
s.dataprep(remove_NULL=True)

In [14]:
s.describe_null()

Unnamed: 0,data type,missing percentage,total missing values
user_id,int64,0.0,0
browser,object,0.0,0
city,object,0.0,0
country,object,0.0,0
device_type,object,0.0,0
domain,object,0.0,0
landing_page,object,0.0,0
event_name,object,0.0,0
time,object,0.0,0
session_time,object,0.0,0


***---> save() function will store all the data to both pickle and SQL ***

In [15]:
s.save()

In [19]:
event_pickle = pickle.load(open( "event_df.pickle", "rb" ))

In [17]:
s.load()

Unnamed: 0,index,user_id,browser,city,country,device_type,domain,landing_page,event_name,time,session_time,path,platform
0,0,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747666148,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
1,1,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747674196,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
2,2,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747680207,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
3,3,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747682213,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
4,4,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747694243,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
5,5,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747698246,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
6,6,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747704256,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
7,7,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747706275,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
8,8,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747716290,1454744465860,/back/bell-schedules,Mac OS X 10.10.5
9,9,4305745796442202,Chrome 48.0.2564,Lewes,United States,Desktop,stanbridge.emotenow.com,stanbridge.emotenow.com/teacher/,Modified time of Emote,1454747718387,1454744465860,/back/bell-schedules,Mac OS X 10.10.5


In [113]:
u = Users('Stanbridge')

In [116]:
u.dataprep()

In [117]:
u.describe_null()

Unnamed: 0,data type,missing percentage,total missing values
_id,object,0.0,0
_idstudent,object,0.0,0
_idemotion,object,0.0,0
_f_intensity,int64,0.0,0
detail,object,0.746413,3017
_dt,object,0.0,0
_iduser,object,0.0,0
firstName,object,0.0,0
lastName,object,0.0,0
gender,object,0.0,0


In [119]:
u.dataprep(remove_NULL=True)
u.describe_null()

Unnamed: 0,data type,missing percentage,total missing values
_id,object,0.0,0
_idstudent,object,0.0,0
_idemotion,object,0.0,0
_f_intensity,int64,0.0,0
detail,object,0.0,0
_dt,object,0.0,0
_iduser,object,0.0,0
firstName,object,0.0,0
lastName,object,0.0,0
gender,object,0.0,0




    

## Module 2:  explorative analysis for the event data (events.py)

    
- Description: Module 2 provides function to draw traffic of user-app interaction as well as segmenting users



- Functions: 
  - find\_unique\_event, extract\_event, map\_event
  - segment\_emote\_users, find\_active\_IDs, segment\_active\_users
  - define\_webnet, draw\_webnet
  - cal\_stay\_time, draw\_stay\_time





In [40]:
'''''''''''''''''''''''''''''''''''''''
 Module for event explorative analysis
'''''''''''''''''''''''''''''''''''''''
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

In [51]:
# find unique event name

def find_unique_event(df):
    return list(set(df['event_name']))

In [52]:
# extract event list from users in chronological order (event type can repeat in this list)

def extract_event(df):
    event_data= []
    n = 0
    event_list = df['event_name']
    if len(event_list) == 1:
        return event_list.tolist()
    else:
        for i, j in zip(event_list, event_list[1:].tolist() + event_list[:1].tolist()):
            n += 1 # count the number of iteration
            if i != j:
                if len(event_data) == 0:   # if it's the first iteration
                    event_data.append(i)
                    event_data.append(j)
                elif n != len(event_list): # if it's not the last iteration
                    event_data.append(j)
        return event_data

In [65]:
# function to create pairwise event links

def map_events(df, user_type = 'active'):
    '''
    Args:
        df: dataframe for users who submitted Emote
        user_type: 'active' or 'nonactive'
    '''
    [user_not_submit_emote, user_submit_emote] = segment_emote_users(df)
    [active_id, nonactive_id] = find_active_IDs(user_submit_emote)
    [active_df, nonactive_df] = segment_active_users(df, active_id, nonactive_id)

    if user_type == 'active':
        active_by_session = active_df.groupby('session_time').groups
        session_graph = []
        unique_event = find_unique_event(active_df)
        for i in active_by_session:
            if len(active_by_session[i]) > 1:
                current_events = active_df.loc[active_by_session[i]]
                session_events = extract_event(current_events)
                session_eventTOpos = [unique_event.index(j) for j in session_events]
                session_graph_temp = [(session_eventTOpos[k],session_eventTOpos[k+1]) for k in range(len(session_eventTOpos)-1)]
                session_graph = session_graph + session_graph_temp
    elif user_type == 'nonactive':
        nonactive_by_session = nonactive_df.groupby('session_time').groups
        session_graph = []
        unique_event = find_unique_event(nonactive_df)
        for i in nonactive_by_session:
            if len(nonactive_by_session[i]) > 1:
                current_events = nonactive_df.loc[nonactive_by_session[i]]
                session_events = extract_event(current_events)
                session_eventTOpos = [unique_event.index(j) for j in session_events]
                session_graph_temp = [(session_eventTOpos[k],session_eventTOpos[k+1]) for k in range(len(session_eventTOpos)-1)]
                session_graph = session_graph + session_graph_temp
                
    return session_graph

In [83]:
# segment active user data versus non-active user data

def segment_emote_users(df):
    
    df_user_id = df.groupby('user_id').groups
    users_not_submit_emote = 0
    unique_event = find_unique_event(df)
    
    for i in df_user_id.values():
        # event 23 is "submit emote"
        NumOfSubmitEmote = np.sum(df.loc[i]['event_name']=='submit_emote')
        if NumOfSubmitEmote==0:
            try:
                users_not_submit_emote = users_not_submit_emote.append(df.loc[i])
            except:
                users_not_submit_emote = df.loc[i]
    users_submit_emote = df.loc[list(set(df.index.values) - set(users_not_submit_emote.index.values))]

    return users_not_submit_emote, users_submit_emote

In [55]:
# find active and non_active users ID

def find_active_IDs(df):
    '''
    Args:
        df: users who submitted emote
    '''
    emote_IDs = df.groupby('user_id')
    NumOfSessions = {}
    for i in emote_IDs.groups.keys():
        NumOfSessions[i] = len(emote_IDs.get_group(i).groupby('session_time').groups)
        df_sessions = pd.DataFrame([NumOfSessions.keys(), NumOfSessions.values()]).transpose()
        df_sessions = df_sessions.sort_values(by = 1, ascending = False)
    active_user_ID = df_sessions.iloc[0:11][0]
    non_active_user_ID = df_sessions.iloc[11:][0]
    
    return active_user_ID, non_active_user_ID

In [56]:
# segment active user data versus non-active user data

def segment_active_users(users_submit_emote, active_user_ID, non_active_user_ID):
    active_df = 0
    nonactive_df = 0
    for i in range(len(active_user_ID)):
        try:
            active_df = active_df.append(users_submit_emote[users_submit_emote.user_id == active_user_ID.iloc[i]])
        except:
            active_df = users_submit_emote[users_submit_emote.user_id == active_user_ID.iloc[i]]

    for i in range(len(non_active_user_ID)):
        try:
            nonactive_df = nonactive_df.append(users_submit_emote[users_submit_emote.user_id == non_active_user_ID.iloc[i]])
        except:
            nonactive_df = users_submit_emote[users_submit_emote.user_id == non_active_user_ID.iloc[i]]
    
    return active_df, nonactive_df

In [57]:
# function to define user-app interaction network: layout, nodes, edges, etc

def define_webnet(G, graph, dict_eoccur, edge_color, interested_node = {},
               node_size = 1000, labels=None, graph_layout='shell',
               node_color='black', node_alpha=0.3,
               node_text_size=20,
               edge_alpha=0.3, edge_tickness=10,
               edge_text_pos=0.3,
               text_font='sans-serif', cmap = plt.cm.Reds):
    '''
    Args:
        G: networkx graph object
        graph:
        dict_eoccur:
        interested_node:
    '''
    # add edges
    for edge in graph:
        G.add_edge(edge[0], edge[1])

    # node layout options
    if graph_layout == 'spring':
        graph_pos=nx.spring_layout(G)
    elif graph_layout == 'spectral':
        graph_pos=nx.spectral_layout(G)
    elif graph_layout == 'random':
        graph_pos=nx.random_layout(G)
    elif graph_layout == 'circular':
        graph_pos=nx.circular_layout(G)
    else:
        graph_pos=nx.shell_layout(G)

    # scale edge width to show traffic volume
    edge_width = {}
    for i in G.edges():
        if i in dict_eoccur.keys():
            edge_width[i] = dict_eoccur[i]
    edge_width = [edge_width[i] for i in G.edges()]
    edge_width = [10.0*edge_width[i]/max(edge_width) for i in range(len(edge_width))]

    # highlight interested traffic by redefining edge_width
    if len(interested_node):
        interested_df = pd.DataFrame([edge_width, G.edges()]).transpose()

        for i in range(len(interested_df)):
            if not len(interested_node.intersection(interested_df[1][i])):
                interested_df[0][i] = 0
            if 3 in interested_df[1][i] or 7 in interested_df[1][i] or 37 in interested_df[1][i]:
                interested_df[0][i] = 0

        edge_width = interested_df[0]
        # convert to conditional values
        edge_width = [50*edge_width[i]/sum(edge_width) for i in range(len(edge_width))]

    # draw graph
    nx.draw_networkx_nodes(G,graph_pos,node_size = node_size,
                           alpha=node_alpha, node_color=node_color, cmap = cmap)
    nx.draw_networkx_edges(G,graph_pos,width=edge_width,
                           alpha=edge_alpha, edge_color=edge_color, arrows = True)
    nx.draw_networkx_labels(G, graph_pos, font_size=node_text_size,
                            font_family=text_font)

    # label nodes
    if labels is None:
        labels = range(len(graph))

    # label edges
    edge_labels = dict(zip(graph, labels))

In [73]:
# function to draw user-app interaction network: layout, nodes, edges, etc

def draw_webnet(df, user_type = 'active', traffic_direction = True, interested_node = {}):
    '''
    Args:
        df: clean dataframe for the school
        user_type: 'active': active users
                   'nonactive': nonactive users
        traffic_direction: True: positive direction (e.g., 9->13),
                           False: negative direction (e.g., 13->9)
    '''
    session_graph = map_events(df, user_type)

    # create networkx graph
    G=nx.Graph()

    # count eventTOevent occurance
    event_occurance = [[i,session_graph.count(i)] for i in set(session_graph)]

    # change to dict
    dict_eoccur = {}
    for i in event_occurance:
        dict_eoccur[i[0]] = i[1]

    # plot traffic
    dict_positive = {}
    dict_negative = {}
    for i in dict_eoccur.keys():
        if i[0]<i[1]:  # positive direction
            dict_positive.update({i:dict_eoccur[i]})
        elif i[0]>i[1]:  # positive direction
            dict_negative.update({i:dict_eoccur[i]})

    # split bi-directional traffic
    positive_graph = [session_graph[i] for i in range(len(session_graph)) 
                      if session_graph[i][0] < session_graph[i][1]]
    negative_graph = [session_graph[i] for i in range(len(session_graph)) 
                      if session_graph[i][0] > session_graph[i][1]]

    # swap tuple element in the negative case for the matching later
    dict_negative_revise = {}
    for i in range(len(dict_negative.keys())):
        dict_negative_revise[(dict_negative.keys()[i][1],dict_negative.keys()[i][0])] = dict_negative.values()[i]

    # draw the network
    if traffic_direction == True:
        define_webnet(G, positive_graph, dict_eoccur = dict_positive, interested_node = interested_node, 
                      graph_layout = 'shell', edge_color='blue')
    elif traffic_direction == False:
        define_webnet(G, negative_graph, dict_eoccur = dict_negative_revise, interested_node = interested_node,
                      graph_layout = 'shell', edge_color='red')

    frame = plt.gca()
    frame.axes.get_xaxis().set_visible(False)
    frame.axes.get_yaxis().set_visible(False)
    plt.show()


In [81]:
# calculate stay_time in each event, key: event index, value: time in seconds

def cal_stay_time(df, user_type = 'active'):
    '''
    Args:
        df: data frames of users who submitted emote
        user_type: 'active' or 'nonactive'
    '''
    stay_time = {}
    [user_not_submit_emote, user_submit_emote] = segment_emote_users(df)

    if user_type == 'active':
        df_user = user_submit_emote[:11]
    elif user_type == 'nonactive':
        df_user = user_not_submit_emote[11:]

    unique_event = find_unique_event(df_user)
    submit_user_events = df_user.groupby('event_name').groups.keys()
        
    for i in unique_event:
        # if the event is in the list of user interaction
        if i in submit_user_events:
            df_event_name = df_user.groupby('event_name').groups
            df_event_in_session = df_user.loc[df_event_name[i]].groupby('session_time').groups
            # sum all the time
            for j in range(len(df_event_in_session)):
                time = df_user.loc[df_event_in_session.values()[j][-1]]['time']
                session_time = df_user.loc[df_event_in_session.values()[j][-1]]['session_time']
                if j == 0:
                    total_session_time = np.floor(0.001*(int(time) - int(session_time)))
                else:
                    total_session_time += np.floor(0.001*(int(time) - int(session_time)))
            stay_time[unique_event.index(i)] = total_session_time
        # if not, assign nan
        else:
            stay_time[unique_event.index(i)] = np.nan

    return stay_time


In [74]:
# function to draw time spent on each event for both active and non-active users

def draw_stay_time(df):
    stay_time_active = cal_stay_time(df, user_type='active')
    stay_time_nonactive = cal_stay_time(df, user_type='nonactive')
    activeMeans = [stay_time_active.values()[i]/np.nansum(stay_time_active.values()) 
                   for i in range(len(stay_time_active))]
    nonactiveMeans = [stay_time_nonactive.values()[i]/np.nansum(stay_time_nonactive.values()) 
                      for i in range(len(stay_time_nonactive))]

    df_active = pd.DataFrame([activeMeans, nonactiveMeans]).transpose()
    df_active_sort = df_active.sort_values(by= 0, ascending = False)
    df_active_sort = df_active_sort.fillna(value=0)

    ind = np.arange(51)  # the x locations for the groups
    width = 0.35       # the width of the bars

    fig, ax = plt.subplots()
    rects1 = ax.bar(ind, df_active_sort[0], width, color='b')

    rects2 = ax.bar(ind + width, df_active_sort[1], width, color='g')
    plt.xticks(range(len(df_active_sort)), df_active_sort.index)

    coefficients = np.polyfit(range(len(df_active_sort)), df_active_sort[0], deg = 6)
    polynomial = np.poly1d(coefficients)
    xs = np.arange(0, 51, 1)
    ys = polynomial(xs)
    line1, = plt.plot(xs, ys, linewidth = 2)

    coefficients = np.polyfit(range(len(df_active_sort)), df_active_sort[1], deg = 6)
    polynomial = np.poly1d(coefficients)
    xs = np.arange(0, 51, 1)
    ys = polynomial(xs)
    line2, = plt.plot(xs, ys, linewidth = 2)
    
    plt.legend([line1, line2], ['Active Users', 'NonActive Users'], fontsize = 14)
    plt.ylabel('stay time (%)')
    plt.xlabel('event number')
    plt.show()

** draw_webnet is the star function in the 2nd module. It basically utilizes other functions (segment, map events etc) to draw the net flow of different events for active users and nonactive users. By defining user_type and traffic_direction, four net flow figures will be generated for comparison. **

In [69]:
draw_webnet(s.df, user_type='active', traffic_direction=True, interested_node={})

** draw_stay_time draws the average time spent on each event by active users and nonactive users. **

In [75]:
draw_stay_time(s.df)

## Module 3:  explorative analysis for the user data (users.py)

- Description: Module 3 draws user retention of two user groups (submit an emote versus not submit an emote)


- Functions: cal\_user\_retention, draw\_user\_retention


In [87]:
'''''''''''''''''''''''''''''''''''''''
 Module for user explorative analysis
'''''''''''''''''''''''''''''''''''''''
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from pylab import *

In [99]:
# aligned array addition (same length)
def add_array(a, b):
    if len(a) < len(b):
        c = b.copy()
        c[:len(a)] += a
    else:
        c = a.copy()
        c[:len(b)] += b
    return c

# create array for user retention
def user_array(df):
    for i in range(len(df)):
        if i == 0:
            sum = np.ones(np.ceil(df)[i])
        else:
            sum = add_array(sum,np.ones(np.ceil(df)[i]))
    return sum

# normalize user data
def user_normalize(raw):
    return [float(i)/raw[0] for i in raw]

In [96]:
# sort events per user in choronological order

def user_retention(df, sign = 1):
    # df is the segmented users: submit an emote VS not submit an emote.
    # sign = 0 is user not submit; sign = 1 default, is user who submit an emote
    time_first_seen = {}
    time_first_emote = {}
    time_last_seen = {}
    NumOfEmoteUser = len(df.groupby('user_id'))

    for k in range(NumOfEmoteUser):
        sort_user = df.loc[df.groupby(['user_id']).groups.values()[k]].sort_values(by = 'time')

        # first time register
        time_first_seen[k] = sort_user['time'].iloc[0]

        # first time submit an emote
        lp = range(len(sort_user))
        for i in lp:
            extract_event = sort_user.iloc[i]['event_name']
            if extract_event == 'submit_emote':
                time_first_emote[k] = sort_user.iloc[i]['time']
                break
                
        #time_last_seen
        time_last_seen[k] = sort_user['time'].iloc[-1]
        
    if sign:
        df_time = pd.DataFrame([time_first_seen.values(), time_first_emote.values(), time_last_seen.values()])
        df_time = df_time.transpose().rename(columns = {0:'first seen', 1:'first emote', 2:'last seen'})
    else:
        df_time = pd.DataFrame([time_first_seen.values(), time_last_seen.values()])
        df_time = df_time.transpose().rename(columns = {0:'first seen', 1:'last seen'})

    return df_time

In [103]:
# draw user retention treand between users who submitted verus did not submit emote

def draw_user_retention(df):
    
    [user_not_submit_emote, user_submit_emote] = events.segment_emote_users(df)
    
    # for users who did not submit an emote
    df_user_not_submit = user_retention(user_not_submit_emote, 0)
    days_not_user_stay = [0.001*(int(df_user_not_submit['last seen'][i]) - 
                                 int(df_user_not_submit['first seen'][i]))/3600/24
                            for i in range(len(df_user_not_submit['first seen']))]
    days_not_user_stay = [days_not_user_stay[i]+1 for i in range(len(days_not_user_stay))]

    # for users who submit an emote
    df_user_submit = user_retention(user_submit_emote)
    days_user_stay = [0.001*(int(df_user_submit['last seen'][i]) - int(df_user_submit['first seen'][i]))/3600/24
                        for i in range(len(df_user_submit['first seen']))]

    line1, = plt.plot(range(len(user_array(days_not_user_stay))), 
                      user_normalize(user_array(days_user_stay)[0:len(user_array(days_not_user_stay))]),
                      color = 'blue', linewidth=6)
    line2, = plt.plot(range(len(user_array(days_not_user_stay))), user_normalize(user_array(days_not_user_stay)), 
                      linewidth=6, color = 'red')
    
    plt.legend([line1, line2], ['Users who have submit emote', 'Users who have not submit emote'], 
                fontsize = 14, loc = 4)
    plt.xlabel('Number of Days Remain Active', fontsize=18)
    plt.ylabel('Percentage of Users', fontsize=18)
    plt.xticks(fontsize = 18)
    plt.yticks(fontsize = 18)
    plt.title('User Retention 2015-2016', fontsize=18)
    plt.show()

** user_retention is used to calculate when the users were first seen and last seen**

In [97]:
user_retention(user_submit_emote, sign=1)

Unnamed: 0,first seen,first emote,last seen
0,1440175111193,1440438469001,1463693135025
1,1440174265211,1440174961333,1465253866656
2,1440174229332,1440174907770,1465247440734
3,1440174406675,1440174821463,1464987047205
4,1440174270472,1440175173873,1463002327244
5,1440174400807,1440174922483,1465247488839
6,1440174435795,1440174926092,1442594854151
7,1440174280680,1440174916719,1464797964625
8,1440174238894,1440174377721,1461866924682
9,1440174368987,1440175202333,1464985906955


** draw_user_retention draws out the curve that depicts the drop-off trends. **

In [104]:
draw_user_retention(s.df)

## Module 4:  prediction model (prediction.py)

- Description: Module 4 builds a predictive model for user drop-offs


- Functions: get\_users, sessions\_1st\_week, events\_1st\_week, define\_features, norm\_data, feature\_importance,                prediction


In [123]:
'''''''''''''''''''''''''''
 Module for prediction
'''''''''''''''''''''''''''
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from pylab import *

import sklearn.svm
import sklearn.lda
import sklearn.cross_validation
from sklearn import linear_model

from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression



In [124]:
# get the entire data set for training and testing for one school, with labels
def get_users(df, sign = 1):
    
    ''' 
    Args:
        df is the segmented users: submit an emote VS not submit an emote.
        sign = 0 is user not submit; sign = 1 default, is user who submit an emote
    '''
    
    user_ID = {}
    number_of_event = {}
    time_first_seen = {}
    time_first_emote = {}
    time_last_seen = {}

    NumOfEmoteUser = len(df.groupby('user_id'))

    for k in range(NumOfEmoteUser):
        sort_user = df.loc[df.groupby(['user_id']).groups.values()[k]].sort_values(by = 'time')
        
        # user ID
        user_ID[k] = sort_user['user_id'].iloc[0]

        # unique events
        number_of_event[k] = len(set(sort_user['event_name']))

        # first time register
        time_first_seen[k] = sort_user['time'].iloc[0]

        # first time submit an emote
        lp = range(len(sort_user))
        for i in lp:
            extract_event = sort_user.iloc[i]['event_name']
            if extract_event == 'submit_emote':
                time_first_emote[k] = sort_user.iloc[i]['time']
                break

        #time_last_seen
        time_last_seen[k] = sort_user['time'].iloc[-1]

    if sign:
        df_time = pd.DataFrame([user_ID.values(), time_first_seen.values(), time_first_emote.values(),
                                time_last_seen.values(), number_of_event.values()])
        df_time = df_time.transpose().rename(columns = {0: 'user_ID', 1: 'first seen', 2:'first emote', 
                                                        3:'last seen', 4: 'unique events'})
    else:
        df_time = pd.DataFrame([time_first_seen.values(), time_last_seen.values()])
        df_time = df_time.transpose().rename(columns = {0:'first seen', 1:'last seen'})
    
    # label users based on their active index
    df_time['active days'] = [0.001*(int(df_time['last seen'][i]) - int(df_time['first seen'][i]))/3600/24 
                         for i in range(len(df_time))]
    df_time = df_time.sort_values(by='active days')
    df_time['active index'] = [df_time['active days'].iloc[i]/max(df_time['active days']) for i in range(len(df_time))]
    
    label = [np.zeros(len([x for x in df_time['active index'] if x < 0.1])).tolist() + 
             np.ones(len([x for x in df_time['active index'] if x < 0.55 and x >0.1])).tolist() +
            (2*np.ones(len([x for x in df_time['active index'] if x < 0.9 and x >0.55]))).tolist() +
            (3*np.ones(len([x for x in df_time['active index'] if x > 0.9]))).tolist()] 
    df_time['label'] = label[0]
    df_time = df_time.reset_index()
    df_time = df_time.drop('index', 1)
    
    return df_time

In [137]:
df_time = get_users(user_submit_emote)

In [138]:
df_time.head()

Unnamed: 0,user_ID,first seen,first emote,last seen,unique events,active days,active index,label
0,4304068568078895,1440174333681,1440174825265,1440177491034,23,0.036543,0.000126,0.0
1,131921401932564,1440174435795,1440174926092,1442594854151,19,28.014101,0.096262,0.0
2,6849849736271290,1440174315784,1440174806062,1444923805720,26,54.970948,0.188891,1.0
3,3683755132721978,1440174237534,1440174875038,1448049926386,25,91.153806,0.313222,1.0
4,3545662722757577,1440174309709,1440174902721,1457045537949,26,195.268845,0.670981,2.0


In [128]:
# function to find how many sessions in the first week
def sessions_1st_week(userID, users_submit_emote):
    submit_groups = users_submit_emote.groupby('user_id')
    all_session = sorted(submit_groups.get_group(userID).groupby('session_time').groups.keys())
    start_time = int(all_session[0])
    for i in range(len(all_session)):
        duration = 0.001*(int(all_session[i]) - start_time)/3600/24
        if duration > 7:
            break
    return i

In [129]:
# function to find the list of events for every user in the first week
def events_1st_week(userID, users_submit_emote):
    submit_groups = users_submit_emote.groupby('user_id')
    start_time = int(submit_groups.get_group(userID)['time'].iloc[0])
    for i in range(len(submit_groups.get_group(userID))):
        duration = 0.001*(int(submit_groups.get_group(userID)['time'].iloc[i]) - start_time)/3600/24
        if duration > 7:
            break
    return submit_groups.get_group(userID).iloc[0:i]

In [207]:
# function to define and add features as columns to the data frame
def define_features(df_time, users_submit_emote):
    
    '''
    Args:
        df_time: data frame returned from get_users
        users_submit_emote: users who submit emote
    '''
    
    session_1week = {}
    submit_groups = users_submit_emote.groupby('user_id')
    for i in submit_groups.groups.keys():
        session_1week[i] = sessions_1st_week(i, users_submit_emote)

    df_time['f1:session'] = [session_1week[i] for i in df_time['user_ID']]
    df_time['f1:session'] = [1.0*(df_time['f1:session'][i]-np.mean(df_time['f1:session']))/np.std(df_time['f1:session'])
                            for i in range(len(df_time))]

    same_day = [0.001*(int(df_time['first emote'][i]) - int(df_time['first seen'][i]))/3600/24
                for i in range(len(df_time['first seen']))]

    # features to add: all features are quantified based on the first week of activity 
    emote_1st_week = [0 if same_day[i]>14 else 1 for i in range(len(same_day))] 
    df_time['f2:1stEmote'] = emote_1st_week

    NumOfEmotes = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                   ['event_name'].tolist().count('submit_emote')/np.ceil(df_time[df_time['user_ID']==i]
                   ['active days']) for i in df_time['user_ID']]
    NumOfViewNewEmote = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                         ['event_name'].tolist().count('Viewed new emote page 1')/np.ceil(df_time[df_time['user_ID']==i]
                         ['active days']) for i in df_time['user_ID']]
    NumOfViewEmoteDetail = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                            ['event_name'].tolist().count('Viewed Emote detail')/np.ceil(df_time[df_time['user_ID']==i]
                            ['active days']) for i in df_time['user_ID']]
    NumOfViewEmoteOverview = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                              ['event_name'].tolist().count('Viewed Emote overview')/np.ceil(df_time[df_time['user_ID']==i]
                              ['active days']) for i in df_time['user_ID']]
    NumOfViewCharts = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                       ['event_name'].tolist().count('Viewed Charts')/np.ceil(df_time[df_time['user_ID']==i]
                       ['active days']) for i in df_time['user_ID']]
    NumOfTouch = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                  ['event_name'].tolist().count('Touched free text during submission')/np.ceil(df_time[df_time['user_ID']==i]
                  ['active days']) for i in df_time['user_ID']]
    NumOfTimeline = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                     ['event_name'].tolist().count('Viewed Timeline')/np.ceil(df_time[df_time['user_ID']==i]
                     ['active days']) for i in df_time['user_ID']]
    NumOfMiddle = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                   ['event_name'].tolist().count('Roster: click on main middle part')/np.ceil(df_time[df_time['user_ID']==i]
                   ['active days']) for i in df_time['user_ID']]
    NumOfProfile = [1.0*users_submit_emote.loc[events_1st_week(i,users_submit_emote).index.values]
                    ['event_name'].tolist().count('Viewed student profile')/np.ceil(df_time[df_time['user_ID']==i]
                    ['active days']) for i in df_time['user_ID']]

    df_time['f3:NumOfEmotes'] = NumOfEmotes
    df_time['f4:New Emote'] = NumOfViewNewEmote
    df_time['f5:Emote Detail'] = NumOfViewEmoteDetail
    df_time['f6:Emote Overview'] = NumOfViewEmoteOverview
    df_time['f7:Charts'] = NumOfViewCharts
    df_time ['f8:Touch'] = NumOfTouch
    df_time['f9:Timeline'] = NumOfTimeline
    df_time['f10:Middle'] = NumOfMiddle
    df_time ['f11:Profile'] = NumOfProfile
    
    return df_time

In [208]:
df_time = define_features(df_time, user_submit_emote)
df_time.head()

Unnamed: 0,user_ID,first seen,first emote,last seen,unique events,active days,active index,label,f1:session,f2:1stEmote,f3:NumOfEmotes,f4:New Emote,f5:Emote Detail,f6:Emote Overview,f7:Charts,f8:Touch,f9:Timeline,f10:Middle,f11:Profile
0,4304068568078895,1440174333681,1440174825265,1440177491034,23,0.036543,0.000126,0.0,-1.09788,1,"0 4.0 Name: active days, dtype: float64","0 4.0 Name: active days, dtype: float64","0 22.0 Name: active days, dtype: float64","0 15.0 Name: active days, dtype: float64","0 0.0 Name: active days, dtype: float64","0 1.0 Name: active days, dtype: float64","0 0.0 Name: active days, dtype: float64","0 9.0 Name: active days, dtype: float64","0 4.0 Name: active days, dtype: float64"
1,131921401932564,1440174435795,1440174926092,1442594854151,19,28.014101,0.096262,0.0,-0.956218,1,"1 0.0 Name: active days, dtype: float64","1 0.0 Name: active days, dtype: float64","1 0.0 Name: active days, dtype: float64","1 0.0 Name: active days, dtype: float64","1 0.0 Name: active days, dtype: float64","1 0.0 Name: active days, dtype: float64","1 0.0 Name: active days, dtype: float64","1 0.0 Name: active days, dtype: float64","1 0.034483 Name: active days, dtype: float64"
2,6849849736271290,1440174315784,1440174806062,1444923805720,26,54.970948,0.188891,1.0,-0.814556,1,"2 0.0 Name: active days, dtype: float64","2 0.0 Name: active days, dtype: float64","2 0.0 Name: active days, dtype: float64","2 0.0 Name: active days, dtype: float64","2 0.0 Name: active days, dtype: float64","2 0.0 Name: active days, dtype: float64","2 0.0 Name: active days, dtype: float64","2 0.0 Name: active days, dtype: float64","2 0.018182 Name: active days, dtype: float64"
3,3683755132721978,1440174237534,1440174875038,1448049926386,25,91.153806,0.313222,1.0,1.310372,1,"3 0.0 Name: active days, dtype: float64","3 0.0 Name: active days, dtype: float64","3 0.0 Name: active days, dtype: float64","3 0.0 Name: active days, dtype: float64","3 0.0 Name: active days, dtype: float64","3 0.0 Name: active days, dtype: float64","3 0.0 Name: active days, dtype: float64","3 0.0 Name: active days, dtype: float64","3 0.097826 Name: active days, dtype: float64"
4,3545662722757577,1440174309709,1440174902721,1457045537949,26,195.268845,0.670981,2.0,0.743725,1,"4 0.0 Name: active days, dtype: float64","4 0.0 Name: active days, dtype: float64","4 0.0 Name: active days, dtype: float64","4 0.0 Name: active days, dtype: float64","4 0.0 Name: active days, dtype: float64","4 0.0 Name: active days, dtype: float64","4 0.0 Name: active days, dtype: float64","4 0.0 Name: active days, dtype: float64","4 0.096939 Name: active days, dtype: float64"


In [232]:
# split and standardize data
def norm_data(df_time):

    data_set = df_time[['f1:session','f2:1stEmote','f3:NumOfEmotes', 'f4:New Emote', 'f5:Emote Detail',
    'f6:Emote Overview', 'f7:Charts', 'f8:Touch', 'f9:Timeline', 'f10:Middle', 'f11:Profile','label']]
    train, test = sklearn.cross_validation.train_test_split(data_set, test_size = 0.25, 
                                                            train_size = 0.75, random_state = 123)

    train_scale = sklearn.preprocessing.StandardScaler().fit(train.drop(['label'],1))
    train_transform = train_scale.transform(train.drop(['label'],1))
    test_scale = sklearn.preprocessing.StandardScaler().fit(test.drop(['label'],1))
    test_transform = train_scale.transform(test.drop(['label'],1))

    data_scale = sklearn.preprocessing.StandardScaler().fit(data_set.drop(['label'],1))
    data_transform = data_scale.transform(data_set.drop(['label'],1))
    
    return train_transform, train, test_transform, test

In [242]:
[train_transform, train, test_transform, test] = norm_data(df_time)

In [35]:
def feat_importance(df_time):
    model = ExtraTreesClassifier()
    model.fit(df_time[['f1:session','f2:1stEmote','f3:NumOfEmotes', 'f5:New Emote', 'f6:Emote Detail',
    'f7:Emote Overview', 'f8:Charts', 'f9:Touch', 'f10:Timeline', 'f11:Middle', 'f12:Profile']], df_time['label'])
    # display the relative importance of each attribute
    print(model.feature_importances_)
    plt.barh(range(len(model.feature_importances_)), sorted(model.feature_importances_))
    plt.yticks(np.arange(11)+0.5, reversed(['Profile', 'Session', 'NumOfEmotes', 'Middle Part', 'Touch', 
                                            'Emote Detail', 'Emote Overview', '1st Emote', 
                                             'New Emote', 'Timeline','Touch Text']))
    title('Feature Importance', size =20)
    plt.show()

In [226]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(4)
    plt.xticks(tick_marks, ['1 month', '3 months', '6 months', 'over 6 months'], rotation=45)
    plt.yticks(tick_marks, ['1 month', '3 months', '6 months', 'over 6 months'])
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [245]:
def prediction(train_transform, train, test_transform, test):
    # fit model
    ## lda model
    lda_clf = sklearn.lda.LDA()
    lda_clf.fit(train_transform, train['label'])

    # prediction
    print('1st sample from test dataset classified as:', lda_clf.predict(test_transform))
    print('actual class label:', test['label'])
    
    # confusion matrix
    print('Confusion Matrix of the LDA-classifier')
    print(metrics.confusion_matrix(test['label'], lda_clf.predict(test_transform)))

    y_pred = lda_clf.predict(test_transform)
    y_test = test['label']

    # Compute confusion matrix
    cm = sklearn.metrics.confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)

    # Normalize the confusion matrix by row (i.e by the number of samples
    # in each class)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    print('Normalized confusion matrix')
    print(cm_normalized)
    plt.figure()
    plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
    plt.show()

In [246]:
prediction(train_transform, train, test_transform, test)

('1st sample from test dataset classified as:', array([ 3.,  3.,  3.,  3.,  3.,  3.,  2.]))
('actual class label:', 24    3.0
5     2.0
7     2.0
8     2.0
25    3.0
11    3.0
20    3.0
Name: label, dtype: float64)
Confusion Matrix of the LDA-classifier
[[0 3]
 [1 3]]
Normalized confusion matrix
[[ 0.    1.  ]
 [ 0.25  0.75]]


