In [1]:
import pandas as pd

In [60]:
'''
Loading Data from Local Computer
Each author is a subfolder, and within each folder is a series of .txt files
The goal of this cell is to load all the contents of every subfolder into the 
DataFrame, while retaining the author designation for those works.
'''

from os import listdir

def multiple_file_load(file_directory):
    
    # identifying all author subfolders - appending them into list 
    
    authorlist = []
    textlist = []
    
    for author in listdir(file_directory):
        authorname = str(author)
        author_sub_directory = (file_directory + '/' + author) #author file path
    
    # identifying all files within each subfolder - 
    
        for filename in listdir(author_sub_directory):
            text_file_path = (author_sub_directory + '/' + filename) # text file path
            
            if (filename.lower().endswith('txt')):
                authorlist.append(authorname)
                textfile = open(text_file_path,'r') # this is how you open files
                substantive_text = textfile.read()  # this is how to read a file
                textlist.append(substantive_text)   # this is how to do something with that file
                textfile.close()                    # this is how to close the file 
                                                             # (you must close one before opening another!)
  # pushing the two lists into a dataframe 

    df = pd.DataFrame({'Author':authorlist, 'Text':textlist})
    
    return df
                

In [61]:
# loading training data (note the file path)
df_train = multiple_file_load('D:/Github/Data-Science-Bootcamp/CAPSTONE - Unsupervised Learning/C50/C50train')

In [82]:
df_train.head()

Unnamed: 0,Author,Text,Word Count,Character Count
0,AaronPressman,The Internet may be overflowing with new techn...,319,2009
1,AaronPressman,The U.S. Postal Service announced Wednesday a ...,416,2604
2,AaronPressman,Elementary school students with access to the ...,72,491
3,AaronPressman,An influential Internet organisation has backe...,463,2907
4,AaronPressman,An influential Internet organisation has backe...,356,2299


In [83]:
import re

In [84]:
# adding the space in the authors...because I want it

author_split = [re.findall('[A-Z][a-z]*', i) for i in df_train.Author]

In [85]:
author_split[:5]

[['Aaron', 'Pressman'],
 ['Aaron', 'Pressman'],
 ['Aaron', 'Pressman'],
 ['Aaron', 'Pressman'],
 ['Aaron', 'Pressman']]

In [86]:
author_join = []

for couple in author_split:
    joined_string = couple[0] + ' ' + couple[1]
    author_join.append(joined_string)    

In [87]:
df_train['Author'] = pd.Series(author_join)
df_train.tail()

Unnamed: 0,Author,Text,Word Count,Character Count
2495,William Kazer,China's central bank chief has said that infla...,404,2425
2496,William Kazer,"China ushered in 1997, a year it has hailed as...",632,3900
2497,William Kazer,China issued tough new rules on the handling o...,330,1985
2498,William Kazer,China will avoid bold moves in tackling its ai...,524,3085
2499,William Kazer,Communist Party chief Jiang Zemin has put his ...,636,3819


In [88]:
# adding word count to df_train

df_train['Character Count'] = pd.Series([len(i) for i in df_train['Text']])
df_train['Word Count'] = pd.Series([len(i.split()) for i in df_train['Text']])
df_train.head()

Unnamed: 0,Author,Text,Word Count,Character Count
0,Aaron Pressman,The Internet may be overflowing with new techn...,319,2009
1,Aaron Pressman,The U.S. Postal Service announced Wednesday a ...,416,2604
2,Aaron Pressman,Elementary school students with access to the ...,72,491
3,Aaron Pressman,An influential Internet organisation has backe...,463,2907
4,Aaron Pressman,An influential Internet organisation has backe...,356,2299


In [74]:
# checking for duplicates

s1 = df_train['Text'][3]
s2 = df_train['Text'][4]
print(s1)
print('\n', s2) # ctrl + f'ing the differences - there appear to be differences (word count and character count support that)
                              

An influential Internet organisation has backed away from a proposal to dramatically expand the number of addresses available on the global computer network.
The Internet Society, which helps develop and coordinate Internet standards, announced this week that instead of moving ahead with the proposal it would form a nine-member committee to study the issue.
Unveiled in August, the proposal was an attempt to quell the growing number of disputes over desirable address names.
"There just seems to be no consensus at all," said Don Heath, president of the society. A variety of objections and questions have been raised about the August proposal, from technical networking issues to concerns about protection for trademarks and intellectual property.
"We thought we should try to pull it all together and have a good cross-section of people to discuss it and see if we can't reach a consensus," he said.
The new committee ought to finish its work by early next year and new names could be doled out 

In [78]:
# checking these counts
import collections

c = collections.Counter(df_train['Character Count'])
print(c.most_common(5))

[(2802, 6), (3172, 5), (2718, 5), (3225, 5), (2639, 5)]


In [89]:
df_check = df_train[(df_train['Character Count'] == 2802) | (df_train['Character Count'] == 3172) | 
                   (df_train['Character Count'] == 2718) | (df_train['Character Count'] == 3225)].copy()

df_check = df_check.sort_values(by='Character Count', ascending=False)
df_check.head(20)

Unnamed: 0,Author,Text,Word Count,Character Count
1357,Lynne O,Asian traders raised their eyebrows on Monday ...,553,3225
39,Aaron Pressman,The various sectors of the U.S. financial serv...,482,3225
170,Benjamin Kang,China called on Taiwan on Tuesday to show good...,512,3225
1890,Peter Humphrey,Britain's Prince Andrew has arrived in Hong Ko...,554,3225
669,Jane Macartney,If foreign governments thought Beijing listene...,538,3225
1058,Keith Weir,"Marjorie Scardino, set to become Pearson chief...",554,3172
1888,Peter Humphrey,China and Britain moved closer on Friday to re...,511,3172
19,Aaron Pressman,The Federal Reserve took another step Friday t...,497,3172
815,Joe Ortiz,The world's major banks are busy jockeying for...,518,3172
450,Eric Auchard,"Computer Associates International Inc., a lead...",480,3172


In [None]:
# okay! no duplicates! yay!
# could have more easily done pd.drop_duplicates, but whatever man