In [34]:
import glob 
import pandas as pd
import numpy as np

In [43]:
word_per_line = 100

## Wikipedia dataset

In [52]:
directory = "wikipedia_text/"
sets = ["chinese_dynasties/", "constellations/", "indian_states/", "languages/", "pokemon_characters/"]

all_titles = []
all_split_passages = []

for s in sets: 
    print("processing passages in set " + s)
    path = directory + s + "*.txt"
    pre_len = len(directory + s)
    files = glob.glob(path)
    for fname in files: 
        with open(fname, "r") as f:
            lines = [line.strip() for line in f if len(line.strip())>0]
        passage = "".join(lines).split()
        s = len(passage)//word_per_line if len(passage)%word_per_line==0 else len(passage)//100+1
        split_passages = [" ".join(passage[i*word_per_line:(i+1)*word_per_line]) for i in range(s)]
        
        title = fname[pre_len:-4].replace("_", " ")
        titles = [title] * len(split_passages)
        all_titles.extend(titles)
        all_split_passages.extend(split_passages)
        print("\t" + fname + " done")

processing passages in set chinese_dynasties/
	wikipedia_text/chinese_dynasties/Han_dynasty.txt done
	wikipedia_text/chinese_dynasties/Northern_Wei.txt done
	wikipedia_text/chinese_dynasties/Xia_dynasty.txt done
	wikipedia_text/chinese_dynasties/Liu_Song_dynasty.txt done
	wikipedia_text/chinese_dynasties/Jin_dynasty_(266–420).txt done
	wikipedia_text/chinese_dynasties/Tang_dynasty.txt done
	wikipedia_text/chinese_dynasties/Xin_dynasty.txt done
	wikipedia_text/chinese_dynasties/Liang_dynasty.txt done
	wikipedia_text/chinese_dynasties/Shang_dynasty.txt done
	wikipedia_text/chinese_dynasties/Zhou_dynasty.txt done
	wikipedia_text/chinese_dynasties/Sui_dynasty.txt done
	wikipedia_text/chinese_dynasties/Chen_dynasty.txt done
processing passages in set constellations/
	wikipedia_text/constellations/aquarius_constellation.txt done
	wikipedia_text/constellations/virgo_constellation.txt done
	wikipedia_text/constellations/capricornus_constellation.txt done
	wikipedia_text/constellations/gemini_c

In [53]:
assert len(all_split_passages) == len(all_titles)

ids = range(1, len(all_split_passages)+1)

In [54]:
processed_df1 = pd.DataFrame(columns=['id', 'text', 'title'], data=np.array([ids, all_split_passages, all_titles]).T)
processed_df1

Unnamed: 0,id,text,title
0,1,"The Han dynasty (UK: , US: ; Chinese: 漢朝; piny...",Han dynasty
1,2,AD) and the Eastern Han (25–220 AD). Spanning ...,Han dynasty
2,3,divided into areas directly controlled by the ...,Han dynasty
3,4,during the Zhou dynasty (c. 1050–256 BC). The ...,Han dynasty
4,5,"negative numbers in mathematics, the raised-re...",Han dynasty
...,...,...,...
3237,3238,"the most important Pokémon of all time."" In 20...",pikachu pokemon
3238,3239,as one of the twelve most influential video ga...,pikachu pokemon
3239,3240,the most irritating 1990s cartoon characters. ...,pikachu pokemon
3240,3241,Maria. The Official Pokémon Handbook. Scholast...,pikachu pokemon


In [55]:
processed_df1.to_csv("processed_wikipedia_passages_1.tsv", sep='\t', index=False)

## Prototype starter data

In [56]:
directory = "prototype_starter/data/"
sets = ["set1/", "set2/", "set3/"]

all_titles = []
all_split_passages = []

for s in sets: 
    print("processing passages in set " + s)
    path = directory + s + "*.txt"
    files = glob.glob(path)
    for fname in files: 
        with open(fname, "r") as f:
            lines = [line.strip() for line in f if len(line.strip())>0]
        passage = "".join(lines[1:]).split()
        s = len(passage)//word_per_line if len(passage)%word_per_line==0 else len(passage)//100+1
        split_passages = [" ".join(passage[i*word_per_line:(i+1)*word_per_line]) for i in range(s)]

        titles = [lines[0]] * len(split_passages)
        all_titles.extend(titles)
        all_split_passages.extend(split_passages)
        print("\t" + fname + " done")

processing passages in set set1/
	prototype_starter/data/set1/a1.txt done
	prototype_starter/data/set1/a2.txt done
	prototype_starter/data/set1/a3.txt done
	prototype_starter/data/set1/a7.txt done
	prototype_starter/data/set1/a6.txt done
	prototype_starter/data/set1/a4.txt done
	prototype_starter/data/set1/a5.txt done
	prototype_starter/data/set1/a8.txt done
	prototype_starter/data/set1/a9.txt done
	prototype_starter/data/set1/a10.txt done
processing passages in set set2/
	prototype_starter/data/set2/a1.txt done
	prototype_starter/data/set2/a2.txt done
	prototype_starter/data/set2/a3.txt done
	prototype_starter/data/set2/a7.txt done
	prototype_starter/data/set2/a6.txt done
	prototype_starter/data/set2/a4.txt done
	prototype_starter/data/set2/a5.txt done
	prototype_starter/data/set2/a8.txt done
	prototype_starter/data/set2/a9.txt done
	prototype_starter/data/set2/a10.txt done
processing passages in set set3/
	prototype_starter/data/set3/a1.txt done
	prototype_starter/data/set3/a2.txt do

In [57]:
assert len(all_split_passages) == len(all_titles)

ids = range(1, len(all_split_passages)+1)

In [58]:
processed_df2 = pd.DataFrame(columns=['id', 'text', 'title'], data=np.array([ids, all_split_passages, all_titles]).T)
processed_df2

Unnamed: 0,id,text,title
0,1,"Clinton Drew ""Clint"" Dempsey /ˈdɛmpsi/ (born M...",Clint Dempsey
1,2,"initially by a jaw injury, he would eventually...",Clint Dempsey
2,3,"international goals, making him the nation's s...",Clint Dempsey
3,4,local Mexican-dominated adult league. Dempsey ...,Clint Dempsey
4,5,"him to rejoin the club.On November 27, 1995, D...",Clint Dempsey
...,...,...,...
1717,1718,are derived from European languages. Both infu...,Esperanto
1718,1719,"derivations should be used whenever possible, ...",Esperanto
1719,1720,existing languages; as well as being arguably ...,Esperanto
1720,1721,much.ModificationsThough Esperanto itself has ...,Esperanto


In [59]:
processed_df2.to_csv("processed_wikipedia_passages_2.tsv", sep='\t', index=False)

## Two DF together

In [65]:
all_df = processed_df1.append(processed_df2, ignore_index=True).drop("id", axis=1).reset_index().rename({"index": "id"}, axis=1)
all_df

Unnamed: 0,id,text,title
0,0,"The Han dynasty (UK: , US: ; Chinese: 漢朝; piny...",Han dynasty
1,1,AD) and the Eastern Han (25–220 AD). Spanning ...,Han dynasty
2,2,divided into areas directly controlled by the ...,Han dynasty
3,3,during the Zhou dynasty (c. 1050–256 BC). The ...,Han dynasty
4,4,"negative numbers in mathematics, the raised-re...",Han dynasty
...,...,...,...
4959,4959,are derived from European languages. Both infu...,Esperanto
4960,4960,"derivations should be used whenever possible, ...",Esperanto
4961,4961,existing languages; as well as being arguably ...,Esperanto
4962,4962,much.ModificationsThough Esperanto itself has ...,Esperanto


In [66]:
all_df.to_csv("processed_wikipedia_passages_both.tsv", sep='\t', index=False)