In [1]:
import pandas as pd
import seaborn as sns
import datetime
import numpy as np
from matplotlib import pyplot as plt

#%matplotlib tk

In [2]:
# Load the voting records. The path is relative, so the CSV has to be in the
# notebook's working directory (the recorded run failed because it was not).
legco_df = pd.read_csv('records-all-with-info.csv')

OSError: File b'records-all-with-info.csv' does not exist

In [None]:
# Summary statistics of the loaded frame, as a first sanity check.
legco_df.describe()

In [None]:
# Peek at the first rows to see which columns are available.
legco_df.head()

In [None]:
def from_str_to_date(date_str):
    """Parse a ``day/month/year`` string into a ``datetime`` at midnight.

    Only the first three ``/``-separated fields are read, and each one must
    be accepted by ``int``.
    """
    fields = date_str.split('/')
    day, month, year = int(fields[0]), int(fields[1]), int(fields[2])
    return datetime.datetime(year=year, month=month, day=day)

# Derive a datetime column from the textual vote date, then preview the frame.
legco_df['dt'] = legco_df['vote-date'].map(from_str_to_date)
legco_df.head()

In [None]:
# Number of distinct member_id values as they appear in the raw data.
len(legco_df.member_id.unique())

In [None]:
def clean_record(t):
    """Map known variant spellings of a member name to one canonical form.

    Any value that is not one of the listed variants is returned unchanged.
    """
    if t == 'Dr Joseph LEE':
        return 'Prof Joseph LEE'
    # https://github.com/code4hk/legcohk/issues/1
    if t == '郭偉强':
        return '郭偉強'
    # Further normalisation rules, if any, go here.
    return t
# Normalise the member names in place, then recount the distinct members.
legco_df.member_id = legco_df.member_id.apply(clean_record)
len(legco_df.member_id.unique())

In [None]:
def to_numeric(x):
    """Encode one vote as a number: 'Yes' -> 1, 'No' -> -1, anything else -> 0.

    Values that are already 1 or -1 pass through unchanged. The result is
    written back into the same ``vote`` column it was read from, so without
    this pass-through a second run of that cell would turn every vote into 0.
    """
    if x == 'Yes' or x == 1:
        return 1
    if x == 'No' or x == -1:
        return -1
    # Every other marker, and missing values, count as neither for nor against.
    return 0
# Replace the textual votes with their numeric codes, overwriting the column.
legco_df['vote'] = legco_df['vote'].apply(to_numeric)

In [None]:
# Preview the frame again after the vote column has been re-encoded.
legco_df.head()

In [None]:
# Keep only the four columns the rest of the notebook works with.
legco_df_selected = legco_df[['topic_id', 'member_id', 'vote', 'dt']]
legco_df_selected.head()

In [None]:
def get_pc1_from_df(df, low_landmark='梁國雄', high_landmark='曾鈺成'):
    """Score every member on the first principal component of the votes.

    Args:
        df: frame with ``member_id``, ``topic_id`` and ``vote`` columns, at
            most one row per (member, topic) pair.
        low_landmark: member whose score must not exceed ``high_landmark``'s.
        high_landmark: member anchoring the other end of the axis.

    Returns:
        DataFrame indexed by member_id with a single ``PC1`` column, sorted
        in ascending order of ``PC1``.

    Raises:
        KeyError: if either landmark member has no row in ``df``.
        ValueError: if a member has more than one row for the same topic.
    """
    # Members x topics matrix; a (member, topic) pair with no record is NaN.
    df_matrix = df.pivot(index='member_id', columns='topic_id', values='vote')

    # NOTE:
    #    For a more rigorous study, investigate what the missing values are.
    # Anything that is not an explicit +1 / -1 (NaN included) is treated as 0.
    df_matrix = df_matrix.where(df_matrix.isin([1, -1]), 0)

    # X is of dimensions: n_samples (members) x n_features (topics)
    X = df_matrix.values.astype('float')
    X = X - X.mean(axis=0)

    # PCA via SVD of the centred matrix: the first-component scores are the
    # first left singular vector scaled by the largest singular value. This
    # matches a one-component PCA up to sign, and the sign is fixed below.
    u, s, _ = np.linalg.svd(X, full_matrices=False)
    df_pc1 = pd.DataFrame({'PC1': u[:, 0] * s[0]},
                          index=df_matrix.index.rename(None))

    # The sign of a principal component is arbitrary; fix the relative order
    # of the landmark members so results are comparable between calls.
    if df_pc1.loc[low_landmark, 'PC1'] > df_pc1.loc[high_landmark, 'PC1']:
        df_pc1['PC1'] = -df_pc1['PC1']

    # Sort after orienting so the result is ascending whether or not it flipped.
    return df_pc1.sort_values('PC1')

# PC1 scores computed over all of the selected records at once.
df_pc1 = get_pc1_from_df(legco_df_selected)

In [None]:
# Date range covered by the records: latest vote date, then earliest.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(legco_df_selected.dt.max())
print(legco_df_selected.dt.min())

In [None]:
import time
#from multiprocessing import Pool 
# I have 4 cores. Change this to fit you
#p = Pool(4)

from dateutil.relativedelta import relativedelta 

# Monthly cutoff dates: advance one month at a time from init_date until the
# cutoff reaches or passes the latest vote date.
# NOTE(review): init_date itself is never a cutoff because the date is advanced
# before it is recorded. Confirm that this is intended.
init_date = datetime.datetime(2012, 12, 26)
last_vote_date = legco_df_selected.dt.max()  # loop-invariant, computed once
cutoff_dates = []
cur_date = init_date
while cur_date < last_vote_date:
    cur_date += relativedelta(months=1)
    cutoff_dates.append(cur_date)
if not cutoff_dates:
    # No vote after init_date: fall back to a single cutoff at init_date.
    cutoff_dates.append(cur_date)

def get_1year_records_till(t_end):
    """Return the selected records dated within the year ending at ``t_end``.

    The window is half-open: ``t_end - 1 year <= dt < t_end``.
    """
    window_start = t_end - relativedelta(years=1)
    vote_dates = legco_df_selected.dt
    in_window = (vote_dates >= window_start) & (vote_dates < t_end)
    return legco_df_selected[in_window]

def get_pc1_at_cutoff_date(t):
    """Pair cutoff date ``t`` with the PC1 scores of the year ending at ``t``."""
    window_records = get_1year_records_till(t)
    window_pc1 = get_pc1_from_df(window_records)
    return (t, window_pc1)

_begin_time = time.time()
# This runs for: 47.6116089821
# A list comprehension does the work inside the timed region. On Python 3 a
# bare map() is lazy and would be timed as taking no time at all.
PCs = [get_pc1_at_cutoff_date(t) for t in cutoff_dates]
# This runs for
#PCs = p.map(get_pc1_at_cutoff_date, cutoff_dates)
_end_time = time.time()
print('Elapse: %s' % (_end_time - _begin_time))

# One row per member and one column per cutoff date, holding that window's
# PC1 score. Members without a score in a window stay NaN.
scores = pd.DataFrame(index=legco_df_selected['member_id'].unique())
for t, PC in PCs:
    scores[t] = PC

# Rank the members within each cutoff date, then show one member's history.
rankings = scores.rank()
rankings.loc['梁國雄']

In [None]:
# Per-member summary of the rank across all cutoff dates.
rankings.T.describe()

In [None]:
# Rank over time for the two landmark members.
rankings.loc[['梁國雄', '曾鈺成']].T.plot()
plt.show()

In [None]:
# Rank over time for every member on one chart.
rankings.T.plot()
plt.show()

In [None]:
# Variance of each member's rank over the cutoff dates, as a one-column frame.
rankings_var = rankings.T.var().to_frame('var')

In [None]:
# Members whose rank varied the most over time.
# NOTE(review): [-10:-1] leaves out the last row of the ascending sort. That
# row is the largest variance, or a NaN-variance member since NaN sorts last.
# Confirm that dropping it is intended.
var_kings = list(rankings_var.sort_values('var')[-10:-1].index)
print(var_kings)
rankings.T[var_kings].plot()
plt.show()

In [None]:
# Persist the rankings table next to the notebook.
rankings.to_csv('rankings.csv')

In [None]:
# Number of distinct members in the selected records.
len(legco_df_selected['member_id'].unique())

In [None]:
# Index the records by vote date so they can be bucketed by week.
legco_df_selected.index = legco_df_selected['dt'].values
# Non-null values per column in each weekly bin. resample(...).count()
# replaces the removed `how='count'` argument.
vote_num = legco_df_selected.resample('1W').count()
vote_num[['vote']].plot()
plt.show()

In [None]:
def _share_of_period(weekly_counts, week, period_start, period_end):
    """Return the 'dt' count of row ``week`` divided by the total 'dt' count
    of the rows whose label lies in ``[period_start, period_end)``."""
    labels = weekly_counts.index
    in_period = (labels >= period_start) & (labels < period_end)
    period_total = weekly_counts.loc[in_period, 'dt'].sum()
    return 1.0 * weekly_counts.loc[week, 'dt'] / period_total

# The volume at the peak week, as a share of the period total.
print(_share_of_period(vote_num, '2014-06-01',
                       datetime.datetime(2013, 8, 31),
                       datetime.datetime(2014, 8, 1)))
print(_share_of_period(vote_num, '2013-05-19',
                       datetime.datetime(2012, 8, 31),
                       datetime.datetime(2013, 8, 1)))

In [None]:
# Compare member ranks under both orientations of PC1.
d1 = df_pc1.rank()
# Rank a negated copy. Plain `d2 = df_pc1` only aliases the frame, so negating
# d2['PC1'] in place flipped df_pc1 itself each time this cell ran.
d2 = (-df_pc1).rank()
d1['another'] = d2['PC1']
d1
df_pc1.T['梁國雄']
df_pc1.loc['曾鈺成', 'PC1']

In [None]:
# Members x topics vote matrix over all selected records: one row per member,
# one column per topic_id. Members with no record for a topic get NaN there.
df_matrix = pd.DataFrame(index=legco_df_selected['member_id'].unique())
for gn, g in legco_df_selected.groupby('topic_id'):
    # Assignment aligns on member_id, so each vote lands in its member's row.
    df_matrix[gn] = g.set_index('member_id')['vote']
df_matrix.head(2)

In [None]:
# Matrix shape, its number of cells, and the number of actual records. The gap
# between the last two is the number of cells left without a record.
print(df_matrix.shape)
print(np.prod(df_matrix.shape))
print(len(legco_df_selected))

In [None]:
# fillna returns a new frame; keep it, otherwise the NaNs are still there
# when the statistics below are computed.
df_matrix = df_matrix.fillna(0)
df_matrix.describe().T.describe()

In [None]:
# Per-topic summary statistics of the vote matrix.
df_matrix.describe()