In [1]:
import pandas as pd

In [2]:
# company_num = BOOKS
# link_num = CHAPTERS
# text = PARAS

OHCO = ['company_num', 'link_num', 'sent_num', 'token_num']

### F0

#### Source Format. The initial source format of a text, which varies by collection, e.g. XML (e.g. TEI and RSS), HTML, plain text (e.g. Gutenberg), JSON, and CSV.

In [3]:
# df = pd.read_csv("CORPUS.csv", lineterminator='\n')
df = pd.read_csv('CORPUS.tar.gz', compression='gzip', lineterminator='\n')
df

Unnamed: 0,company_num,Text,characters
0,0,"Ahresty, with more than 60 years of experienc...",1709
1,0,"PRODUCTS Ahresty, with more than 60 years of e...",754
2,0,ENVIRONMENTAL,16
3,0,CONTACT Address Ahresty Wilmington Corporation...,439
4,1,Manufacturer ofMetal FastenersandGeneral Hardw...,1025
...,...,...,...
90628,1225,"Home•Careers Together, we build the future We...",2524
90629,1225,Privacy The protection of your personal data i...,12706
90630,1225,Signicast acquires European based CIREX 02.15....,5160
90631,1225,Email Protection You are unable to access this...,558


In [4]:
# Since this CORPUS is too big, I only included certian # of companies. 
# Otherwise, it crashes when running sentence seperator cell. ( > 300 companies)
# tokenization doesn't crash when we limit to 200 companies.
df = df[df["company_num"] < 200]

### F1

#### Machine Learning Corpus Format (MLCF). Ideally a table of minimum discursive units indexed by document content hierarchy.

In [5]:
# Add link count column
df['link_num'] = df.groupby('company_num').cumcount()

DOCS = df[["company_num", "link_num" ,"Text", "characters"]]
DOCS = DOCS.rename(columns={'Text': 'text'})
DOCS = DOCS.set_index(["company_num"])
DOCS

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['link_num'] = df.groupby('company_num').cumcount()


Unnamed: 0_level_0,link_num,text,characters
company_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,"Ahresty, with more than 60 years of experienc...",1709
0,1,"PRODUCTS Ahresty, with more than 60 years of e...",754
0,2,ENVIRONMENTAL,16
0,3,CONTACT Address Ahresty Wilmington Corporation...,439
1,0,Manufacturer ofMetal FastenersandGeneral Hardw...,1025
...,...,...,...
198,15,Phone:+55 41 3341 1900 Sitemap Coming Soon… He...,576
198,16,Phone:+55 41 3341 1900 Author:Daniel WHB Autom...,3545
199,0,Committed toQuality MaterialsQuality Workmansh...,332
199,1,Our Services Specializing in: Sandblasting San...,809


### F2

#### Standard Text Analytic Data Model (STADM). A normalized set of tables including DOC, TOKEN, and TERM tables. Produced by the tokenization of F1 data.

In [6]:
CHAPS = DOCS.reset_index().set_index(["company_num", "link_num"])
CHAPS

Unnamed: 0_level_0,Unnamed: 1_level_0,text,characters
company_num,link_num,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,"Ahresty, with more than 60 years of experienc...",1709
0,1,"PRODUCTS Ahresty, with more than 60 years of e...",754
0,2,ENVIRONMENTAL,16
0,3,CONTACT Address Ahresty Wilmington Corporation...,439
1,0,Manufacturer ofMetal FastenersandGeneral Hardw...,1025
...,...,...,...
198,15,Phone:+55 41 3341 1900 Sitemap Coming Soon… He...,576
198,16,Phone:+55 41 3341 1900 Author:Daniel WHB Autom...,3545
199,0,Committed toQuality MaterialsQuality Workmansh...,332
199,1,Our Services Specializing in: Sandblasting San...,809


In [7]:
CHAPS["text"]

company_num  link_num
0            0            Ahresty, with more than 60 years of experienc...
             1           PRODUCTS Ahresty, with more than 60 years of e...
             2                                            ENVIRONMENTAL   
             3           CONTACT Address Ahresty Wilmington Corporation...
1            0           Manufacturer ofMetal FastenersandGeneral Hardw...
                                               ...                        
198          15          Phone:+55 41 3341 1900 Sitemap Coming Soon… He...
             16          Phone:+55 41 3341 1900 Author:Daniel WHB Autom...
199          0           Committed toQuality MaterialsQuality Workmansh...
             1           Our Services Specializing in: Sandblasting San...
             2           Contact No appointment needed – give us a call...
Name: text, Length: 7573, dtype: object

In [8]:
%%time
sent_pat = r'[.?!;:]+'
SENTS = CHAPS['text'].str.split(sent_pat, expand=True).stack().to_frame('sent_str')
SENTS.index.names = ["company_num", "link_num", "sent_num"]

CPU times: user 1.08 s, sys: 108 ms, total: 1.19 s
Wall time: 1.19 s


In [9]:
SENTS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_str
company_num,link_num,sent_num,Unnamed: 3_level_1
0,0,0,"Ahresty, with more than 60 years of experienc..."
0,0,1,Industry leadingmanufacturing technology Ahre...
0,0,2,We utilize leak testing on all machine lines...
0,0,3,We currently have 28 fully automated High Pre...
0,0,4,Global leaderhere at home The Ahresty Wilming...
...,...,...,...
199,2,1,Hours
199,2,2,Monday-Friday 7
199,2,3,30 am - 4
199,2,4,00 pm 618-753-3188 Web Design by Novel Designs...


In [10]:
token_pat = r"[\s',-]+"
TOKENS = SENTS['sent_str'].str.split(token_pat, expand=True).stack()\
    .to_frame('token_str')
TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str
company_num,link_num,sent_num,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,0,
0,0,0,1,Ahresty
0,0,0,2,with
0,0,0,3,more
0,0,0,4,than
...,...,...,...,...
199,2,5,3,byElegant
199,2,5,4,Themes|
199,2,5,5,Powered
199,2,5,6,byWordPress
