## Reformat data for CNN and LSTM models

In [None]:
!pip install sklearn
!pip install pandas
!pip install numpy

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# Directory holding the cleaned dataset; it is reused when the result is saved.
input_path = '../data/'

# Read the cleaned event/news table and list its columns as a quick sanity check.
clean_csv_file = input_path + 'clean_df.csv'
df = pd.read_csv(clean_csv_file)
df.columns

Index(['user_no', 'key_label', 'event', 'event_time', 'post_id', 'news_time',
       'title', 'abstract', 'content', 'industry_name_sc', 'concept_name_sc',
       'industry', 'concept', 'tags', 'clean_content', 'clean_title',
       'clean_abstract'],
      dtype='object')

In [4]:
# Put events in chronological order first, then group them per user so that
# taking the head of each group yields that user's earliest events.
chronological_df = df.sort_values(by='event_time', ascending=True)
sort_df = chronological_df.groupby('user_no')

In [5]:
# Columns carried forward into the per-user modelling table.
kept_columns = [
    'user_no',
    'event_time',
    'key_label',
    'clean_title',
    'clean_abstract',
    'clean_content',
    'tags',
]

# Keep at most the first three (earliest) events of every user.
first_three_events = sort_df.head(3).reset_index()
top_df = first_three_events[kept_columns]
top_df.head(2)

Unnamed: 0,user_no,event_time,key_label,clean_title,clean_abstract,clean_content,tags
0,2321,2022-04-01 14:57:00,0,News highlights Top global markets news of th...,U S stocks declined and oil fell as President...,p strong Dow Falls Points as U S Eyes Rese...,巴菲特持仓 富国银行持仓 纽约梅隆银行持仓 瑞士信贷持仓 瑞银集团持仓 热门ETF 红杉资...
1,2321,2022-04-01 14:57:10,0,News highlights Top global markets news of th...,OPEC and its allies including Russia agreed to...,p strong OPEC Sticks to Production Plan Desp...,昨日强势股 热门ETF 巴菲特持仓 富国银行持仓 纽约梅隆银行持仓 瑞士信贷持仓 瑞银集团...


In [6]:
# Build one text field per article: "<title> <abstract> <content>".
# pandas string concatenation propagates NaN, and read_csv loads empty cells as
# NaN, so a single missing piece would otherwise turn the whole input_info into
# NaN. Missing pieces are therefore treated as empty strings before joining.
title_text = top_df['clean_title'].fillna('')
abstract_text = top_df['clean_abstract'].fillna('')
content_text = top_df['clean_content'].fillna('')
top_df['input_info'] = title_text + ' ' + abstract_text + ' ' + content_text

In [7]:
# Preview the first three rows, now including the combined input_info column.
top_df.head(n=3)

Unnamed: 0,user_no,event_time,key_label,clean_title,clean_abstract,clean_content,tags,input_info
0,2321,2022-04-01 14:57:00,0,News highlights Top global markets news of th...,U S stocks declined and oil fell as President...,p strong Dow Falls Points as U S Eyes Rese...,巴菲特持仓 富国银行持仓 纽约梅隆银行持仓 瑞士信贷持仓 瑞银集团持仓 热门ETF 红杉资...,News highlights Top global markets news of th...
1,2321,2022-04-01 14:57:10,0,News highlights Top global markets news of th...,OPEC and its allies including Russia agreed to...,p strong OPEC Sticks to Production Plan Desp...,昨日强势股 热门ETF 巴菲特持仓 富国银行持仓 纽约梅隆银行持仓 瑞士信贷持仓 瑞银集团...,News highlights Top global markets news of th...
2,2321,2022-04-01 14:57:16,0,Intel to buy optimization software business Gr...,Intel Corp said it is buying Israel based dev...,div class ftEditor p By Will Feuer p di...,半导体 纽文持仓 摩根士丹利持仓 IDC概念 法国巴黎银行持仓 景顺持仓 北方信托银行持仓 ...,Intel to buy optimization software business Gr...


In [8]:
# Empty per-user table: one label column plus three text-input slots.
user_table_columns = ['user_no', 'key_label', 'input_1', 'input_2', 'input_3']
new_df = pd.DataFrame(columns=user_table_columns)

In [9]:
# One row per distinct user, in order of first appearance in top_df.
distinct_user_ids = pd.unique(top_df['user_no'])
new_df['user_no'] = distinct_user_ids

In [10]:
# Only user_no is populated at this point; the other columns are still empty.
new_df.head(n=5)

Unnamed: 0,user_no,key_label,input_1,input_2,input_3
0,2321,,,,
1,1168,,,,
2,2130,,,,
3,1085,,,,
4,2536,,,,


In [11]:
# Distinct user ids in order of first appearance; drives the per-user loops below.
unique_users = top_df.user_no.unique()

In [12]:
# Label of each user = key_label of that user's first row in top_df.
# A single de-duplicated lookup table replaces re-filtering the whole frame
# once per user; values and their order (aligned with unique_users) are the same.
first_label_by_user = (
    top_df.drop_duplicates(subset='user_no', keep='first')
    .set_index('user_no')['key_label']
)
labels = [first_label_by_user.loc[u] for u in unique_users]

In [13]:
# Attach the per-user labels; `labels` is ordered like unique_users.
new_df = new_df.assign(key_label=labels)

In [14]:
# Split each user's (up to three) article texts into three parallel lists,
# padding with a single space when a user has fewer than three articles.
# Explicit padding replaces the former bare `except:` blocks, which would have
# hidden any unrelated error (and even KeyboardInterrupt) behind a silent pad.
PAD_TEXT = ' '  # placeholder for a missing 2nd / 3rd article
N_INPUTS = 3

# Single pass over top_df: collect every user's texts in row order.
texts_by_user = {}
for user_id, text in zip(top_df['user_no'].to_numpy(), top_df['input_info'].to_numpy()):
    texts_by_user.setdefault(user_id, []).append(text)

input_1 = []
input_2 = []
input_3 = []
for u in unique_users:
    user_texts = texts_by_user[u][:N_INPUTS]
    user_texts = user_texts + [PAD_TEXT] * (N_INPUTS - len(user_texts))
    input_1.append(user_texts[0])
    input_2.append(user_texts[1])
    input_3.append(user_texts[2])

In [15]:
# Store the three text slots as columns of the per-user table.
slot_values = {
    'input_1': input_1,
    'input_2': input_2,
    'input_3': input_3,
}
for slot_name, slot_texts in slot_values.items():
    new_df[slot_name] = slot_texts

In [16]:
# Inspect the first rows of the completed per-user table.
new_df.head(5)

Unnamed: 0,user_no,key_label,input_1,input_2,input_3
0,2321,0,News highlights Top global markets news of th...,News highlights Top global markets news of th...,Intel to buy optimization software business Gr...
1,1168,0,Shares Of Penny Stock Tonix Pharma Tick Higher...,Tonix Pharmaceuticals to Participate in the No...,Tonix Pharmaceuticals to Participate in the No...
2,2130,0,Cathie Wood Michael Saylor and Mayor Suarez s...,Dear PLTR Stock Fans Mark Your Calendars for ...,Tesla reports earnings Wednesday Investors sh...
3,1085,0,,Veru shares increasing Covid drug candidat...,Veru s Sabizabulin Shows Reduction in Deaths...
4,2536,0,Will Amazon Stock Split Offset Union Worries ...,Saudi Prince Alwaleed bin Talal Rejects Musk s...,Twitter moves to block Elon Musk from increasi...


In [17]:
# (rows, columns) of the per-user table: one row per distinct user_no,
# five columns (user_no, key_label, input_1, input_2, input_3).
new_df.shape

(4475, 5)

In [18]:
# Persist the per-user table next to the input data, without the row index.
output_csv_file = input_path + 'user_top_3_data.csv'
new_df.to_csv(output_csv_file, index=False)