In [2]:
import pandas as pd 

In [23]:
data_path = './newsCorpora.csv'

In [24]:
df = pd.read_csv(data_path, sep='\t', header=None, names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422419 entries, 0 to 422418
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ID         422419 non-null  int64 
 1   TITLE      422419 non-null  object
 2   URL        422419 non-null  object
 3   PUBLISHER  422417 non-null  object
 4   CATEGORY   422419 non-null  object
 5   STORY      422419 non-null  object
 6   HOSTNAME   422419 non-null  object
 7   TIMESTAMP  422419 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 25.8+ MB


In [26]:
# Extract example from publisher "Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail".
df = df[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail'])]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13340 entries, 12 to 422319
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         13340 non-null  int64 
 1   TITLE      13340 non-null  object
 2   URL        13340 non-null  object
 3   PUBLISHER  13340 non-null  object
 4   CATEGORY   13340 non-null  object
 5   STORY      13340 non-null  object
 6   HOSTNAME   13340 non-null  object
 7   TIMESTAMP  13340 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 938.0+ KB


In [27]:
df['PUBLISHER'].unique()

array(['Reuters', 'Businessweek', 'Huffington Post', 'Daily Mail',
       'Contactmusic.com'], dtype=object)

In [28]:
# Rearrange the extracted examples in random order
df = df.sample(frac=1, random_state=43).reset_index(drop=True)

In [29]:
df['CATEGORY'].unique()

array(['e', 't', 'b', 'm'], dtype=object)

In [30]:
# split to train, valid, test
train_size = int(len(df) * 0.8)
valid_size = int(len(df) * 0.1)
test_size = len(df) - train_size - valid_size
df_train = df[:train_size][["CATEGORY", "TITLE"]]
df_valid = df[train_size:train_size + valid_size][["CATEGORY", "TITLE"]]
df_test = df[-test_size:][["CATEGORY", "TITLE"]]

In [32]:
# Count the number of samples for each label in the train set
train_label_counts = df_train['CATEGORY'].value_counts()

# Count the number of samples for each label in the valid set
valid_label_counts = df_valid['CATEGORY'].value_counts()

# Count the number of samples for each label in the test set
test_label_counts = df_test['CATEGORY'].value_counts()


print("Number of samples for each label in the train set:")
print(train_label_counts)
print("\nNumber of samples for each label in the valid set:")
print(valid_label_counts)
print("\nNumber of samples for each label in the test set:")
print(test_label_counts)


Number of samples for each label in the train set:
CATEGORY
b    4541
e    4182
t    1225
m     724
Name: count, dtype: int64

Number of samples for each label in the valid set:
CATEGORY
b    554
e    529
t    157
m     94
Name: count, dtype: int64

Number of samples for each label in the test set:
CATEGORY
e    568
b    532
t    142
m     92
Name: count, dtype: int64


In [40]:
# Calculate the average, maximum, and minimum number of words in samples in the train set
train_word_counts = df_train['TITLE'].str.split().apply(len)
train_avg_words = train_word_counts.mean()
train_max_words = train_word_counts.max()
train_min_words = train_word_counts.min()

print("\nAverage number of words in each sample in the train set: {:.2f}".format(train_avg_words))
print("Maximum number of words in each sample in the train set:", train_max_words)
print("Minimum number of words in each sample in the train set:", train_min_words)



Average number of words in each sample in the train set: 10.47
Maximum number of words in each sample in the train set: 201
Minimum number of words in each sample in the train set: 2


In [41]:
# Similarly, calculate the average, maximum, and minimum number of words in samples in the valid and test sets
valid_word_counts = df_valid['TITLE'].str.split().apply(len)
valid_avg_words = valid_word_counts.mean()
valid_max_words = valid_word_counts.max()
valid_min_words = valid_word_counts.min()

print("\nAverage number of words in each sample in the valid set: {:.2f}".format(valid_avg_words))
print("Maximum number of words in each sample in the valid set:", valid_max_words)
print("Minimum number of words in each sample in the valid set:", valid_min_words)


Average number of words in each sample in the valid set: 10.39
Maximum number of words in each sample in the valid set: 19
Minimum number of words in each sample in the valid set: 3


In [42]:
test_word_counts = df_test['TITLE'].str.split().apply(len)
test_avg_words = test_word_counts.mean()
test_max_words = test_word_counts.max()
test_min_words = test_word_counts.min()

print("\nAverage number of words in each sample in the test set: {:.2f}".format(test_avg_words))
print("Maximum number of words in each sample in the test set:", test_max_words)
print("Minimum number of words in each sample in the test set:", test_min_words)


Average number of words in each sample in the test set: 10.56
Maximum number of words in each sample in the test set: 18
Minimum number of words in each sample in the test set: 3


In [43]:
with open("train.txt", "w") as f:
    for _, row in df_train.iterrows():
        f.write(f"{row['TITLE']}\t{row['CATEGORY']}\n")
        
with open("valid.txt", "w") as f:
    for _, row in df_valid.iterrows():
        f.write(f"{row['TITLE']}\t{row['CATEGORY']}\n")

with open("test.txt", "w") as f:
    for _, row in df_test.iterrows():
        f.write(f"{row['TITLE']}\t{row['CATEGORY']}\n")