# Assignment 1

In [6]:
# First, install spacy, pandas, and the language model in the terminal, by typing in:
# pip install spacy pandas
# python -m spacy download en_core_web_md

In [1]:
import spacy
import pandas as pd
import os
import re

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
nlp = spacy.load("en_core_web_md")

In [3]:
# create a path to the corpus containing our data and then sort the directory
main_folder_path = "../input/USEcorpus/"
sorted_dir = sorted(os.listdir(main_folder_path))

In [4]:
# creating a for loop going into each subfolder in the corpus and opening all text files within the subfolders
for folder in sorted_dir:
    folder_path = os.path.join(main_folder_path, folder)
    filenames = sorted(os.listdir(folder_path))
    # creating a list for every folder's info to be gathered in. will be used to create dataframes later
    folder_info = []
    
    for text_file in filenames:
        file_path = folder_path + "/" + text_file
        
        with open (file_path, encoding="latin-1") as file:
            text = file.read()
            # removing the metadata and document ID inside <>
            text = re.sub(r'<.+?>', '', text)
            # add text files to a doc object
            doc = nlp(text)
            # create counters for nouns, verbs, adjectives, and adverbs.
            noun_count = 0
            verb_count = 0
            adj_count = 0
            adv_count = 0

            # with a for loop, add 1 to the counter every time that part of speech appears in the doc object
            for token in doc:
                if token.pos_ == "NOUN":
                    noun_count += 1
                elif token.pos_ == "VERB":
                    verb_count += 1
                elif token.pos_ == "ADJ":
                    adj_count += 1
                elif token.pos_ == "ADV":
                    adv_count += 1
            
            # relative frequencies of nouns, verbs, adjectives, and adverbs per 10,000 words, rounded up to 2 decimals        
            relative_freq_noun = round((noun_count/len(doc)) * 10000, 2)
            relative_freq_verb = round((verb_count/len(doc)) * 10000, 2)
            relative_freq_adj = round((adj_count/len(doc)) * 10000, 2)
            relative_freq_adv = round((adv_count/len(doc)) * 10000, 2)
            
            # extracting named entities for persons, locations, and organizations
            persons = set()
            for ent in doc.ents:
                    if ent.label_ == 'PERSON':
                        persons.add(ent.text)
            num_persons = len(persons)
            
            locations = set()
            for ent in doc.ents:
                    if ent.label_ == 'LOC':
                        locations.add(ent.text)
            num_locations = len(locations)
            
            organisations = set()
            for ent in doc.ents:
                    if ent.label_ == 'ORG':
                        organisations.add(ent.text)
            num_organisations = len(organisations)
            
            # create list for every file
            file_info = [text_file, relative_freq_noun, relative_freq_verb, relative_freq_adj, relative_freq_adv, num_persons, num_locations, num_organisations]
            # append the file's info to the collected list for the whole folder's info
            folder_info.append(file_info)
    
        # creating a dataframe with pandas using folder_info
        # make one per subfolder
        df = pd.DataFrame(folder_info,
                    columns=["Filename", "RelFreq NOUN", "RelFreq VERB", "RelFreq ADJ", "RelFreq ADV", "Unique PER", "Unique LOC", "Unique ORG"])
        
        # upload dataframe to output folder
        outpath = os.path.join("..", "output", folder + ".csv")
        df.to_csv(outpath)
        

In [8]:
# view any subfolder's output as a pandas dataframe by switching out the filename for a different .csv file
pd.read_csv("../output/a1.csv")

Unnamed: 0.1,Unnamed: 0,Filename,RelFreq NOUN,RelFreq VERB,RelFreq ADJ,RelFreq ADV,Unique PER,Unique LOC,Unique ORG
0,0,0100.a1.txt,1520.22,1213.39,794.98,529.99,0,0,0
1,1,0101.a1.txt,1158.16,1232.88,585.31,834.37,1,0,0
2,2,0102.a1.txt,1474.44,1189.06,677.76,475.62,1,0,0
3,3,0103.a1.txt,1090.31,1354.63,594.71,572.69,1,0,1
4,4,0104.a1.txt,1311.27,1188.73,563.73,674.02,0,1,2
...,...,...,...,...,...,...,...,...,...
298,298,4045.a1.txt,1542.86,1047.62,742.86,380.95,0,0,0
299,299,5005.a1.txt,1307.30,1443.12,679.12,662.14,0,0,0
300,300,5015.a1.txt,1172.41,1218.39,574.71,436.78,0,0,0
301,301,5022.a1.txt,1412.30,1252.85,501.14,728.93,2,0,0
