### Import Libraries

In [None]:
import pandas as pd
import re
import csv

### Import Original Training Data

In [None]:
# Load the raw training set. The CSV has no header row, so the three columns
# are named explicitly. The `with` block closes the file handle on exit, so no
# separate close() call is needed afterwards.
with open('train.csv') as f:
    df = pd.read_csv(f, header = None, names = ["label","headline", "body"], index_col=False)

In [None]:
# Preview the first five rows of the raw training data.
df.head(n=5)

Unnamed: 0,label,headline,body
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [None]:
# Class balance check: non-null row counts per label for each other column.
df.groupby(by='label').count()

Unnamed: 0_level_0,headline,body
label,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30000,30000
2,30000,30000
3,30000,30000
4,30000,30000


### Downsample Training Set to 5%

In [None]:
# Stratified 5% sample: draw the same fraction from every label, re-seeding
# with the same random_state for each group so the draw is reproducible.
# Each group is sampled explicitly instead of through groupby.apply: newer
# pandas releases stop passing the grouping column to the applied function,
# which would silently drop 'label' from the result.
# Groups are visited in sorted label order and the index is rebuilt 0..N-1.
sample_df = pd.concat(
    [label_rows.sample(frac=0.05, random_state=10) for _, label_rows in df.groupby('label')],
    ignore_index=True,
)

In [None]:
# Preview the first five rows of the downsampled frame.
sample_df.head(n=5)

Unnamed: 0,label,headline,body
0,1,Chinese deputy Prime Minister attacks #39;Bus...,BEIJING: On the eve of the US Presidential pol...
1,1,2 held over Madrid train bombings,"Madrid, Spain (CNN) -- Police arrested two men..."
2,1,Opinion poll boost for Australian PM,An Australian opinion poll has indicated a ris...
3,1,Iran could agree to six month uranium enrichme...,"TEHRAN, Nov 2 (AFP) - Iran is prepared to susp..."
4,1,Sudan Talks in Nigeria Said Deadlocked,ABUJA (Reuters) - After two weeks of talks to...


## EDA of 10% Training Size, n_aug = 9 (the number of augmented sentences generated per original sentence)

---



### Downsample Training Data to 10%

In [None]:
# Stratified 10% sample with a fixed seed per label. Each group is sampled
# explicitly instead of through groupby.apply: newer pandas releases stop
# passing the grouping column to the applied function, which would silently
# drop 'label' from the result.
sample_df_10pc = pd.concat(
    [label_rows.sample(frac=0.10, random_state=10) for _, label_rows in df.groupby('label')],
    ignore_index=True,
)

# Export only (label, body) as headerless TSV: this is the input file for the
# external augmentation script. Headlines are re-attached later by row position.
sample_df_10pc[["label","body"]].to_csv("sample_train_10pc.txt", index = False, header = False, sep = "\t")

To generate the augmented file, run EDA outside this notebook:


*   add the txt file into **improving-pet/data_augmentation/eda_nlp/data/**
*   from that directory, run `python ../code/augment.py --input=sample_train_10pc.txt --num_aug=9` (or the desired n_aug)
*   this will create a new file with `eda_` prepended to the original .txt file name (here `eda_sample_train_10pc.txt`)






### Import Augmented Downsampled Training Data

In [None]:
# Load the augmenter output: headerless, tab-separated (label, body) pairs.
df_10pc = pd.read_csv(
    'eda_sample_train_10pc.txt',
    sep="\t",
    header=None,
    names=["label", "body"],
)

In [None]:
# Rows per label in the augmented 10% set.
df_10pc.groupby(by='label').count()

Unnamed: 0_level_0,body
label,Unnamed: 1_level_1
1,30000
2,30000
3,30000
4,30000


Check that the augmented data is the same size as the original training set (30,000 rows per label).

In [None]:
# Rows in the augmented file per sampled row: n_aug = 9 generated sentences
# plus one more (presumably the original sentence itself -- confirm against
# the augmentation script).
n = 10
df_10pc = pd.read_csv('eda_sample_train_10pc.txt', header = None, sep = "\t", names = ["label", "body"])

# The augmented file only carries (label, body), so headlines are re-attached
# purely by row position. That is only valid when the file holds exactly n
# consecutive rows per sampled row (assumes the augmenter preserves input
# order). Fail loudly on a size mismatch instead of writing misaligned or
# "nan" headlines.
expected_rows = len(sample_df_10pc) * n
if len(df_10pc) != expected_rows:
    raise ValueError(
        "eda_sample_train_10pc.txt has {} rows, expected {} ({} sampled rows x n={}); "
        "check n against the --num_aug used".format(
            len(df_10pc), expected_rows, len(sample_df_10pc), n
        )
    )

# Repeat every headline n times, then renumber so it aligns with df_10pc's index.
headline = sample_df_10pc["headline"].repeat(n).reset_index(drop = True)
df_10pc["headline"] = headline

# Restore the original column order and write everything as quoted strings.
df_10pc = df_10pc[["label", "headline", "body"]].astype(str)
df_10pc.to_csv("eda_train_10pc.csv", index = False, header = False, quoting=csv.QUOTE_ALL)

## EDA of 20% Training Size, n_aug = 4

---



In [None]:
# Stratified 20% sample with a fixed seed per label. Each group is sampled
# explicitly instead of through groupby.apply: newer pandas releases stop
# passing the grouping column to the applied function, which would silently
# drop 'label' from the result.
sample_df_20pc = pd.concat(
    [label_rows.sample(frac=0.2, random_state=10) for _, label_rows in df.groupby('label')],
    ignore_index=True,
)

# Export only (label, body) as headerless TSV: this is the input file for the
# external augmentation script. Headlines are re-attached later by row position.
sample_df_20pc[["label","body"]].to_csv("sample_train_20pc.txt", index = False, header = False, sep = "\t")

To generate the augmented file, run EDA outside this notebook:

- add the txt file into improving-pet/data_augmentation/eda_nlp/data/
- from that directory, run `python ../code/augment.py --input=sample_train_20pc.txt --num_aug=4` (or the desired n_aug)
- this will create a new file with `eda_` prepended to the original .txt file name (here `eda_sample_train_20pc.txt`)



In [None]:
# Load the augmenter output for the 20% sample: headerless, tab-separated
# (label, body) pairs.
df_20pc = pd.read_csv(
    'eda_sample_train_20pc.txt',
    sep="\t",
    header=None,
    names=["label", "body"],
)
# Rows per label in the augmented set.
df_20pc.groupby(by='label').count()

Unnamed: 0_level_0,body
label,Unnamed: 1_level_1
1,30000
2,30000
3,30000
4,30000


In [None]:
# Rows in the augmented file per sampled row: n_aug = 4 generated sentences
# plus one more (presumably the original sentence itself -- confirm against
# the augmentation script).
n = 5

# The augmented file only carries (label, body), so headlines are re-attached
# purely by row position. That is only valid when df_20pc holds exactly n
# consecutive rows per sampled row (assumes the augmenter preserves input
# order). Fail loudly on a size mismatch instead of writing misaligned or
# "nan" headlines.
expected_rows = len(sample_df_20pc) * n
if len(df_20pc) != expected_rows:
    raise ValueError(
        "df_20pc has {} rows, expected {} ({} sampled rows x n={}); "
        "check n against the --num_aug used".format(
            len(df_20pc), expected_rows, len(sample_df_20pc), n
        )
    )

# Repeat every headline n times, then renumber so it aligns with df_20pc's index.
headline = sample_df_20pc["headline"].repeat(n).reset_index(drop = True)
df_20pc["headline"] = headline

# Restore the original column order and write everything as quoted strings.
df_20pc = df_20pc[["label", "headline", "body"]].astype(str)
df_20pc.to_csv("eda_train_20pc.csv", index = False, header = False, quoting=csv.QUOTE_ALL)

## EDA of 25% Training Size, n_aug = 3

---



In [None]:
# Stratified 25% sample with a fixed seed per label. Each group is sampled
# explicitly instead of through groupby.apply: newer pandas releases stop
# passing the grouping column to the applied function, which would silently
# drop 'label' from the result.
sample_df_25pc = pd.concat(
    [label_rows.sample(frac=0.25, random_state=10) for _, label_rows in df.groupby('label')],
    ignore_index=True,
)

# Export only (label, body) as headerless TSV: this is the input file for the
# external augmentation script. Headlines are re-attached later by row position.
sample_df_25pc[["label","body"]].to_csv("sample_train_25pc.txt", index = False, header = False, sep = "\t")

In [None]:
# Load the augmenter output for the 25% sample: headerless, tab-separated
# (label, body) pairs.
df_25pc = pd.read_csv(
    'eda_sample_train_25pc.txt',
    sep="\t",
    header=None,
    names=["label", "body"],
)
# Rows per label in the augmented set.
df_25pc.groupby(by='label').count()

Unnamed: 0_level_0,body
label,Unnamed: 1_level_1
1,30000
2,30000
3,30000
4,30000


In [12]:
# Rows in the augmented file per sampled row: n_aug = 3 generated sentences
# plus one more (presumably the original sentence itself -- confirm against
# the augmentation script).
n = 4

# The augmented file only carries (label, body), so headlines are re-attached
# purely by row position. That is only valid when df_25pc holds exactly n
# consecutive rows per sampled row (assumes the augmenter preserves input
# order). Fail loudly on a size mismatch instead of writing misaligned or
# "nan" headlines.
expected_rows = len(sample_df_25pc) * n
if len(df_25pc) != expected_rows:
    raise ValueError(
        "df_25pc has {} rows, expected {} ({} sampled rows x n={}); "
        "check n against the --num_aug used".format(
            len(df_25pc), expected_rows, len(sample_df_25pc), n
        )
    )

# Repeat every headline n times, then renumber so it aligns with df_25pc's index.
headline = sample_df_25pc["headline"].repeat(n).reset_index(drop = True)
df_25pc["headline"] = headline

# Restore the original column order and write everything as quoted strings.
df_25pc = df_25pc[["label", "headline", "body"]].astype(str)
df_25pc.to_csv("eda_train_25pc.csv", index = False, header = False, quoting=csv.QUOTE_ALL)

In [None]:
# Rows per label after headlines have been re-attached.
df_25pc.groupby(by='label').count()

Unnamed: 0_level_0,headline,body
label,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30000,30000
2,30000,30000
3,30000,30000
4,30000,30000


In [11]:
# Spot-check that each headline is paired with augmentations of its own body.
df_25pc.head(n=5)

Unnamed: 0,label,headline,body
0,1,Chinese deputy Prime Minister attacks #39;Bus...,beijing on the eve of the us presidential poll...
1,1,Chinese deputy Prime Minister attacks #39;Bus...,beijing quot the eve of the us presidential po...
2,1,Chinese deputy Prime Minister attacks #39;Bus...,beijing along on the eve of the us presidentia...
3,1,2 held over Madrid train bombings,beijing on the eve of the us presidential poll...
4,1,2 held over Madrid train bombings,madrid spain cnn police arrested two men on fr...


## Example of Augmented Row (label, headline, body) (n_aug = 4)

- "1","Chinese deputy Prime Minister attacks  #39;Bush doctrine #39;","beijing on the eve of the us presidential poll one of the key architect of china s foreign insurance policy lashed out at the controversial quot vannevar bush doctrine quot of pre emptive strikes the quot catastrophic quot war on iraq"


- "1","Chinese deputy Prime Minister attacks  #39;Bush doctrine #39;","beijing quot the eve of the us presidential poll one of the key architects of china s foreign policy lashed out at the controversial quot bush doctrine quot of pre emptive quot the on catastrophic strikes war on iraq"


- "1","Chinese deputy Prime Minister attacks  #39;Bush doctrine #39;","beijing along on the eve of the us presidential poll one of the key architects of china s foreign policy lashed out at the peking controversial quot central bush doctrine quot of pre emptive strikes the quot catastrophic quot war on iraq"


- "1","Chinese deputy Prime Minister attacks  #39;Bush doctrine #39;","beijing on the eve of the us presidential poll one of the key architects of china s foreign policy lashed out at the controversial quot bush doctrine quot of pre emptive strikes the quot catastrophic quot war on iraq "


- "1","Chinese deputy Prime Minister attacks  #39;Bush doctrine #39;","madrid spain cnn police arrested two men on friday in eastern civilize spain for alleged links to the madrid train bombings last march an interior ministry spokeswoman told kingdom of spain cnn"