In [12]:
import pandas as pd
import os
import csv

In [45]:
# Language codes and product categories that the input folders are organised by.
languages = ['de', 'en', 'fr', 'jp', 'zh']
categories = ['apparel', 'home', 'musical_instruments', 'sports']
# Two-digit, 1-based code per category ('01' .. '04'), following the order of `categories`.
category_code = {name: f'{position:02d}' for position, name in enumerate(categories, start=1)}

In [44]:
# Display the category -> two-digit code mapping to check it by eye.
category_code

{'apparel': '01', 'home': '02', 'musical_instruments': '03', 'sports': '04'}

In [50]:
# Silence pandas' chained-assignment check (SettingWithCopyWarning) for the whole session.
# Note: this is a global switch, so it also hides the warning in every later cell.
pd.set_option('mode.chained_assignment', None)

In [52]:
# Build the "IPT" subset.
#
# For every language and category this cell:
#   1. reads the tab-separated vectors file and the comma-separated labels file,
#   2. pairs them row by row,
#   3. keeps the last SAMPLES_PER_LABEL rows with label 0 and with label 1,
#   4. appends the category code to threadId / segmentId / LsegmentId,
#   5. writes one vectors/labels file per language and one for all languages.
#
# Reads the module-level `languages`, `categories` and `category_code`.
# Leaves `df_ipt_vectors` and `df_ipt_labels` (all languages) in the namespace.

# Number of rows kept per label value for each (language, category) pair.
SAMPLES_PER_LABEL = 10


def _concat_frames(frames):
    """Concatenate a list of DataFrames; an empty list yields an empty DataFrame."""
    # pd.concat raises ValueError on an empty list, so fall back explicitly.
    return pd.concat(frames) if frames else pd.DataFrame()


# Frames are collected in lists and concatenated once per level instead of
# re-concatenating the growing result on every iteration.
all_vector_frames = []
all_label_frames = []

for language in languages:
    lang_vector_frames = []
    lang_label_frames = []

    for category in categories:
        vectors_file = f'vectors/{language}/{category}/150/vectors.txt'
        labels_file = f'labels/{language}/{category}/150/labels.txt'
        df_vectors = pd.read_csv(vectors_file, sep='\t', header=None, names=['threadId', 'segmentId', 'text'])
        df_labels = pd.read_csv(labels_file, header=None, names=['LsegmentId', 'label'])

        # The two files are paired purely by row position. If their lengths
        # differ, a column-wise concat pads the shorter one with NaN and the
        # rows selected below would carry missing ids or text, so fail loudly.
        if len(df_vectors) != len(df_labels):
            raise ValueError(
                f'Row count mismatch for {language}/{category}: '
                f'{len(df_vectors)} rows in {vectors_file} vs '
                f'{len(df_labels)} rows in {labels_file}'
            )

        # Merge vectors and labels side by side (row i of one file with row i of the other)
        df = pd.concat([df_vectors, df_labels], axis=1)

        # Last SAMPLES_PER_LABEL rows of each label, label 0 first
        df_0 = df[df['label'] == 0].tail(SAMPLES_PER_LABEL)
        df_1 = df[df['label'] == 1].tail(SAMPLES_PER_LABEL)
        df_ipt = pd.concat([df_0, df_1])

        # Suffix that marks which category a row came from
        code = category_code[category]

        # Work on explicit copies so the column assignments below never write
        # into (or warn about) a slice of df_ipt.
        df_ipt_categ_vectors = df_ipt[['threadId', 'segmentId', 'text']].copy()
        df_ipt_categ_labels = df_ipt[['LsegmentId', 'label']].copy()

        # Append the category code to every id so ids from different categories differ
        df_ipt_categ_vectors['threadId'] = df_ipt_categ_vectors['threadId'].astype(str) + code
        df_ipt_categ_vectors['segmentId'] = df_ipt_categ_vectors['segmentId'].astype(str) + code
        df_ipt_categ_labels['LsegmentId'] = df_ipt_categ_labels['LsegmentId'].astype(str) + code

        lang_vector_frames.append(df_ipt_categ_vectors)
        lang_label_frames.append(df_ipt_categ_labels)

    # All categories of the current language, in `categories` order
    df_ipt_lang_vectors = _concat_frames(lang_vector_frames)
    df_ipt_lang_labels = _concat_frames(lang_label_frames)

    # Create directory for language
    os.makedirs(f'vectors/IPT/{language}', exist_ok=True)
    os.makedirs(f'labels/IPT/{language}', exist_ok=True)

    # Save the per-language vectors and labels (no header, no index column)
    df_ipt_lang_vectors.to_csv(f'vectors/IPT/{language}/vectors.txt', sep='\t', index=False, header=False)
    df_ipt_lang_labels.to_csv(f'labels/IPT/{language}/labels.txt', index=False, header=False)

    all_vector_frames.append(df_ipt_lang_vectors)
    all_label_frames.append(df_ipt_lang_labels)

# All languages combined, in `languages` order
df_ipt_vectors = _concat_frames(all_vector_frames)
df_ipt_labels = _concat_frames(all_label_frames)

# Save the combined vectors and labels (no header, no index column)
df_ipt_vectors.to_csv('vectors/IPT/vectors.txt', sep='\t', index=False, header=False)
df_ipt_labels.to_csv('labels/IPT/labels.txt', index=False, header=False)
        


In [47]:
# Inspect the combined vectors frame (columns: threadId, segmentId, text).
df_ipt_vectors

Unnamed: 0,threadId,segmentId,text
65,threadId=5006504,segmentId=5006504,!@#NEXLP_DOCVECTOR_SEPARATOR#@!不错. 拍子很好看，吸汗带挺舒...
66,threadId=5006604,segmentId=5006604,!@#NEXLP_DOCVECTOR_SEPARATOR#@!瑕疵严重. 膝盖那块补丁布的裤...
67,threadId=5006704,segmentId=5006704,!@#NEXLP_DOCVECTOR_SEPARATOR#@!有点儿鸡肋了. 想着和背包组合...
68,threadId=5006804,segmentId=5006804,!@#NEXLP_DOCVECTOR_SEPARATOR#@!带子偏松. 无法固定，镜带偏松...
69,threadId=5006904,segmentId=5006904,!@#NEXLP_DOCVECTOR_SEPARATOR#@!一般般. 商品收到了，还行。做...
70,threadId=5007004,segmentId=5007004,!@#NEXLP_DOCVECTOR_SEPARATOR#@!只是好看！. 材料不是我想象的...
71,threadId=5007104,segmentId=5007104,!@#NEXLP_DOCVECTOR_SEPARATOR#@!不爽. 才穿了一次，就破了个小...
72,threadId=5007204,segmentId=5007204,!@#NEXLP_DOCVECTOR_SEPARATOR#@!钛刀叉勺. 在碗上可以刮出黑道...
73,threadId=5007304,segmentId=5007304,!@#NEXLP_DOCVECTOR_SEPARATOR#@!没有弹力的T恤~~~. 觉得花...
74,threadId=5007404,segmentId=5007404,!@#NEXLP_DOCVECTOR_SEPARATOR#@!舒适度和保暖性不错，但裆部撕烂...


In [48]:
# Inspect the combined labels frame (columns: LsegmentId, label).
df_ipt_labels

Unnamed: 0,LsegmentId,label
65,5006504,0
66,5006604,0
67,5006704,0
68,5006804,0
69,5006904,0
70,5007004,0
71,5007104,0
72,5007204,0
73,5007304,0
74,5007404,0
