In [2]:
import pandas as pd
pd.options.display.max_columns = 100
import json, itertools
import time
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta

# Figuring out number of active sessions

We need to find the first session of each member. Since we don't have that information as an explicit date, we can only infer it from the transcript data. We define a person's first session as the first session in which they speak, i.e. the first one where their word count is non-zero.

In [4]:
# Load the word-count table and index it by member ID (PersonIdCode).
# NOTE(review): presumably one column per session holding that member's word
# count — confirm against the CSV.
n_words = pd.read_csv('data/n_words_clean.csv').set_index('PersonIdCode')

We build a new DataFrame where we'll store our results.

In [6]:
# Result table: one row per member (same index labels as n_words), with
# FirstSession starting out as 0 until a session is found.
df = (
    pd.DataFrame(columns=['PersonIdCode', 'FirstSession'])
    .assign(PersonIdCode=n_words.index)
    .set_index('PersonIdCode')
    .fillna(0)
)
df.head(2)

Unnamed: 0_level_0,FirstSession
PersonIdCode,Unnamed: 1_level_1
2023,0
2061,0


We iterate through the rows and, for each member, find their first active session, i.e. the first session in which they spoke.

In [8]:
# For every member, record the first column of n_words with a positive value.
# Members with no positive value are left untouched in df.
# NOTE(review): this relies on the columns of n_words being in chronological
# order — confirm against whatever produced the CSV.
for mmbr, word_counts in n_words.iterrows():
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for sesh, n_uttered in word_counts.items():
        # if the person utters some words, we set that sesh as first sesh
        if n_uttered > 0:
            df.loc[mmbr, 'FirstSession'] = sesh
            break

In [9]:
# Preview the first rows to sanity-check the FirstSession values.
df.head()

Unnamed: 0_level_0,FirstSession
PersonIdCode,Unnamed: 1_level_1
2023,4802
2061,4603
2129,4601
2143,4601
2147,4601


Now we find out the date of these sessions.

In [10]:
# Session metadata; the ID and StartDate columns are read by the date lookup.
df_sesh = pd.read_csv('data/sessions.csv')

# Placeholder value: 0 stands for "no first-session date recorded".
df['FirstSessionDate'] = 0

In [11]:
# The column starts out as the integer placeholder 0; switch it to object dtype
# so date strings can be stored without an implicit int -> object upcast
# (deprecated in pandas 2.1 and rejected by newer versions).
df['FirstSessionDate'] = df['FirstSessionDate'].astype(object)

# Session ID -> StartDate. On duplicate IDs the first row wins, which matches
# taking `.values[0]` of a filtered frame.
start_by_id = df_sesh.drop_duplicates('ID').set_index('ID')['StartDate']

for mmbr, sesh in df['FirstSession'].items():
    sesh_id = int(sesh)
    # 0 means the member has no first session; leave the placeholder in place.
    if sesh_id > 0:
        # An unknown session ID raises KeyError naming the ID.
        start = start_by_id.loc[sesh_id]
        # Keep only the part before 'T'. partition() returns the whole string
        # when there is no 'T'; slicing with str.find() would cut off the last
        # character in that case, because find() returns -1.
        df.loc[mmbr, 'FirstSessionDate'] = start.partition('T')[0]

In [12]:
# Preview the last rows to check that the dates were filled in.
df.tail()

Unnamed: 0_level_0,FirstSession,FirstSessionDate
PersonIdCode,Unnamed: 1_level_1,Unnamed: 2_level_1
3107,5001,2015-11-30
3108,5002,2016-02-29
3109,5001,2015-11-30
3110,5001,2015-11-30
3604,4901,2011-12-05


Next, we compute members' seniority, in days.

In [13]:
# Reference date for the seniority computation.
# NOTE: taken from the clock at run time, so the Seniority values change from
# one run of the notebook to the next.
today = datetime.now().date()

In [14]:
def seniority(row, reference_date=None):
    """Return the number of days between a member's first session and a reference date.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'FirstSessionDate': either a 'YYYY-MM-DD' string or a
        missing marker (0, None or NaN).
    reference_date : datetime.date, optional
        Date to measure against. When omitted, the notebook-level `today` is
        used, so existing `df.apply(seniority, axis=1)` calls behave as before.

    Returns
    -------
    int or None
        The day count, or None when the first-session date is missing.

    Raises
    ------
    ValueError
        If the date string does not match '%Y-%m-%d'.
    """
    then = row['FirstSessionDate']
    # Missing markers: None, 0, or NaN (the only value that differs from itself).
    if then is None or then == 0 or then != then:
        return None
    then = datetime.strptime(then, '%Y-%m-%d').date()
    # Fall back to the global only when no explicit date is given; this keeps
    # the function usable (and reproducible) outside the notebook.
    if reference_date is None:
        reference_date = today
    return int((reference_date - then).days)

In [15]:
# `seniority` yields None for members without a first-session date; count those
# as 0 days. Only the new column is filled, so missing values in other columns
# of df are not overwritten the way a frame-wide in-place fillna would.
df['Seniority'] = df.apply(seniority, axis=1).fillna(0).astype(int)

df.tail()

Unnamed: 0_level_0,FirstSession,FirstSessionDate,Seniority
PersonIdCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3107,5001,2015-11-30,399
3108,5002,2016-02-29,308
3109,5001,2015-11-30,399
3110,5001,2015-11-30,399
3604,4901,2011-12-05,1855


Adding this info to the people DataFrame

In [18]:
# Load the people table and index it by member ID (PersonIdCode).
ppl = pd.read_csv('data/people_jonas.csv').set_index('PersonIdCode')

In [19]:
# Sort the people table by member ID, in place.
ppl.sort_index(inplace=True)

Checking if the order is correct

In [22]:
# Index.equals compares the labels element by element and in order, and
# returns False when the lengths differ (an element-wise `==` raises instead).
ppl.index.equals(df.index)

True

Adding new columns

In [23]:
# Copy the computed columns over; each Series assignment is aligned on the index.
for col in ('FirstSession', 'FirstSessionDate', 'Seniority'):
    ppl[col] = df[col]

Saving

In [None]:
# NOTE: this writes to the same path that ppl was loaded from, so the input
# file is overwritten with the added columns.
ppl.to_csv('data/people_jonas.csv')