In [14]:
import sklearn
import pandas as pd
import os

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import fasttext


In [15]:
# Directory holding the raw input file and the exported CSV/TSV files.
# The trailing slash matters: other cells build file paths by plain string
# concatenation (datapath + 'name').
datapath = '../data/'

In [16]:
# Show which files currently exist in the data directory.
os.listdir(datapath)

['data.csv', 'somefile.txt', 'data.tsv']

In [17]:
# somefile.txt stores one record per three consecutive lines, in the order
# body, title, target.
LINES_PER_RECORD = 3

# Pin the encoding: without it open() falls back to the platform's locale
# encoding, so non-ASCII text (e.g. the curly quotes in some titles) could
# decode differently from one machine to the next.
# NOTE(review): assumes the file is UTF-8 -- confirm against the data source.
with open(datapath + 'somefile.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# A line count that is not a multiple of three means the file is truncated or
# misaligned; stop with a clear message instead of an IndexError at the tail.
if len(lines) % LINES_PER_RECORD != 0:
    raise ValueError(
        "somefile.txt has {} lines, which is not a multiple of {}; "
        "cannot split it into (body, title, target) records".format(
            len(lines), LINES_PER_RECORD
        )
    )

# Strided slices pick every third line starting at offsets 0, 1 and 2, so each
# list holds one field of every record; surrounding whitespace is stripped.
bodies = [line.strip() for line in lines[0::LINES_PER_RECORD]]
Titles = [line.strip() for line in lines[1::LINES_PER_RECORD]]
targets = [line.strip() for line in lines[2::LINES_PER_RECORD]]

# One row per record, with the three fields as string columns.
df = pd.DataFrame({'Body': bodies, 'Title': Titles, 'target': targets})


In [18]:
# Persist the parsed records twice: once comma-separated, once tab-separated.
# The row index is left out of both files.
export_targets = [
    (datapath + 'data.csv', ','),
    (datapath + 'data.tsv', '\t'),
]
for out_path, delimiter in export_targets:
    df.to_csv(out_path, sep=delimiter, index=False)

In [19]:
# Preview the first rows of the parsed records (Body, Title, target).
df.head()

Unnamed: 0,Body,Title,target
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,unrelated
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated


In [20]:
# Split each text column into a list of tokens with NLTK's word_tokenize and
# store the result in a sibling "<column>_tokenized" column.
tokenize_plan = (
    ('Body', 'Body_tokenized'),
    ('Title', 'Title_tokenized'),
)
for text_col, tokens_col in tokenize_plan:
    df[tokens_col] = df[text_col].apply(word_tokenize)

In [21]:
# Length of each token list, i.e. the number of tokens per body / per title.
length_plan = (
    ('Body_tokenized', 'Body_len'),
    ('Title_tokenized', 'Title_len'),
)
for tokens_col, length_col in length_plan:
    df[length_col] = df[tokens_col].apply(len)


In [22]:
# Build one FreqDist per row from its token list, giving a token -> count
# mapping for every body and every title.
freqdist_plan = (
    ('Body_tokenized', 'Body_freqdist'),
    ('Title_tokenized', 'Title_freqdist'),
)
for tokens_col, freq_col in freqdist_plan:
    df[freq_col] = df[tokens_col].apply(FreqDist)

In [23]:
# The size of a FreqDist is its number of distinct keys, so this records how
# many different tokens each body / title contains.
unique_plan = (
    ('Body_freqdist', 'Body_unique_words'),
    ('Title_freqdist', 'Title_unique_words'),
)
for freq_col, unique_col in unique_plan:
    df[unique_col] = df[freq_col].apply(len)

In [24]:
# Preview the frame again, now including the token, length, frequency and
# unique-count columns derived from Body and Title.
df.head()

Unnamed: 0,Body,Title,target,Body_tokenized,Title_tokenized,Body_len,Title_len,Body_freqdist,Title_freqdist,Body_unique_words,Title_unique_words
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated,"[A, small, meteorite, crashed, into, a, wooded...","[Soldier, shot, ,, Parliament, locked, down, a...",364,12,"{'A': 1, 'small': 2, 'meteorite': 5, 'crashed'...","{'Soldier': 1, 'shot': 1, ',': 1, 'Parliament'...",188,12
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated,"[A, small, meteorite, crashed, into, a, wooded...","[Tourist, dubbed, ‘, Spider, Man, ’, after, sp...",364,13,"{'A': 1, 'small': 2, 'meteorite': 5, 'crashed'...","{'Tourist': 1, 'dubbed': 1, '‘': 1, 'Spider': ...",188,13
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated,"[A, small, meteorite, crashed, into, a, wooded...","[Luke, Somers, 'killed, in, failed, rescue, at...",364,10,"{'A': 1, 'small': 2, 'meteorite': 5, 'crashed'...","{'Luke': 1, 'Somers': 1, ''killed': 1, 'in': 2...",188,9
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Ottawa,unrelated,"[A, small, meteorite, crashed, into, a, wooded...","[BREAKING, :, Soldier, shot, at, War, Memorial...",364,9,"{'A': 1, 'small': 2, 'meteorite': 5, 'crashed'...","{'BREAKING': 1, ':': 1, 'Soldier': 1, 'shot': ...",188,9
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated,"[A, small, meteorite, crashed, into, a, wooded...","[Giant, 8ft, 9in, catfish, weighing, 19, stone...",364,24,"{'A': 1, 'small': 2, 'meteorite': 5, 'crashed'...","{'Giant': 1, '8ft': 1, '9in': 1, 'catfish': 1,...",188,23
