# GTA News 50 Clusters: generate input for packing

## Load 50 cluster dataframe

In [1]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from collections import Counter

# directories
dir_path = os.getcwd()

# The data folder is <cwd>/../gta-news/doc2vec/data/.  Building it with
# os.path.join avoids hard-coded backslashes (and the invalid '\d' escape a
# hand-written Windows path invites), so the notebook runs on any OS.
# The trailing '' keeps a trailing separator on local_path, so code that
# concatenates a file name onto it keeps working.
local_path = os.path.join(dir_path, '..', 'gta-news', 'doc2vec', 'data', '')

# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
df = pd.read_pickle(os.path.join(local_path, 'backup-gta.50.pickle'))

In [2]:
# Display the first rows of the loaded frame as a quick sanity check.
df.head()

Unnamed: 0,date,title,url,cluster,vector
0,2019-04-20,"""this is why we can't have nice things in nyc,...",http://dagblog.com/reader-blogs/why-we-cant-ha...,13,"[-0.47769657, 0.3672163, 0.23536347, 0.5756423..."
1,2019-04-20,rural book borrowing in peril as libraries sla...,http://easternontarionetwork.com/2019/04/20/ru...,13,"[-0.5722261, -0.26479113, -0.1152498, 0.664171..."
2,2019-04-20,"""discussing canada's new us-focused cannabis e...",http://www.benzinga.com/markets/cannabis/19/04...,11,"[0.54588157, -0.26139393, -0.18809983, -0.0384..."
3,2019-04-20,serea restaurant slated to open in hotel del c...,http://www.coronadonewsca.com/news/coronado_ho...,25,"[-0.28711024, -0.2106441, -0.61289483, 0.04533..."
4,2019-04-20,federal trial of vernon man accused of abducti...,http://www.courant.com/news/connecticut/hc-new...,46,"[0.062113207, 0.0157832, 0.23044105, 0.4720517..."


## Aggregate counts per date and cluster, then add day column

In [3]:
# Number of rows per (date, cluster) pair.
# A single groupby produces both the keys and the counts, so there is no need
# to build two separately sorted frames and glue them together by position.
# groupby sorts its keys by default, so the result is ordered by date and then
# cluster; reset_index turns the keys back into columns with a 0..n-1 index.
dfAggr = df.groupby(['date', 'cluster']).size().reset_index(name='count')

# Independent copy of the count column, kept under this name in case later
# cells refer to it.
dfCount = dfAggr['count'].copy()

dfAggr.head()

Unnamed: 0,date,cluster,count
0,2019-01-21,0,8
1,2019-01-21,1,2
2,2019-01-21,2,3
3,2019-01-21,3,2
4,2019-01-21,4,1


In [5]:
# NOTE: dfChart is another name for dfAggr (not a copy), so the 'day' column
# added below is visible through both names.
dfChart = dfAggr

# add 'day' column: 1-based offset in days from 2019-01-01 (Jan 1 -> 1)
new_year_day = pd.Timestamp(year=2019, month=1, day=1)

# Vectorised date arithmetic instead of a Python-level loop over rows.
# pd.to_datetime is a no-op for a datetime64 column and also covers a column
# that holds Timestamp objects with object dtype.
dfChart['day'] = (pd.to_datetime(dfChart['date']) - new_year_day).dt.days + 1

dfChart.head()

Unnamed: 0,date,cluster,count,day
0,2019-01-21,0,8,21
1,2019-01-21,1,2,21
2,2019-01-21,2,3,21
3,2019-01-21,3,2,21
4,2019-01-21,4,1,21


## Add missing rows (zero-count rows for clusters absent on a day)

In [None]:
num_clusters = 50

# For every date in dfChart, collect a zero-count row for each cluster id in
# 0..num_clusters-1 that has no row on that date.
#
# missing_rows is a list of DataFrames (one per date that has gaps) with the
# same columns as dfChart: date, cluster, count, day.  It stays empty when
# every date already has all clusters.
#
# Working per date with a set difference does not depend on dfChart being
# sorted or on its index being 0..n-1, unlike a row-by-row scan that peeks at
# the next row to detect the end of a day.
missing_rows = []
for (date, day), day_rows in dfChart.groupby(['date', 'day'], sort=True):
    present = set(day_rows['cluster'])
    absent = [cl_num for cl_num in range(num_clusters) if cl_num not in present]
    if not absent:
        continue

    for cl_num in absent:
        print ("Created missing row: day:" + str(day) + ", cluster:" + str(cl_num))

    # Scalars (date, 0, day) are broadcast to the length of `absent`.
    missing_rows.append(pd.DataFrame({
        'date': date,
        'cluster': absent,
        'count': 0,
        'day': day,
    }))

In [13]:
# Combine the observed rows with the zero-count filler rows.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; missing_rows is a list of DataFrames, so it is unpacked
# after dfChart.  sort=True orders the columns alphabetically.
dfChart2 = pd.concat([dfChart, *missing_rows], sort=True)

# Within each day: largest count first, ties broken by cluster id.
dfChart2 = dfChart2.sort_values(['day', 'count', 'cluster'], ascending=[True, False, True])

# Discard the duplicated index labels left over from the concatenation.
dfChart2 = dfChart2.reset_index(drop=True)

# Spot-check one day.
dfChart2[dfChart2.day == 110].head()

Unnamed: 0,cluster,count,date,day
4450,46,11,2019-04-20,110
4451,3,7,2019-04-20,110
4452,13,5,2019-04-20,110
4453,14,5,2019-04-20,110
4454,17,4,2019-04-20,110


## Save CSV

In [17]:
# Write the packing input: one line per (day, cluster) with a fixed column
# order and without the DataFrame index.
output_columns = ['day', 'date', 'cluster', 'count']
dfChart2.loc[:, output_columns].to_csv("packing_input.csv", index=False)