# Assignment 1

The task:

1. Loop over each text in the `in` folder
2. Extract the following information
    - Relative frequency of nouns, verbs, adjectives and adverbs per 10,000 words
    - Total number of unique PER, LOC and ORG entities
3. For each sub-folder (a1, a2, a3, ...), save a table which shows the following information

In [2]:
import os
import pandas as pd
import glob
import spacy
import re

In [12]:
# Location of the corpus root, expressed relative to the working directory
corpus_parts = ("..", "in", "USEcorpus")
filepath = os.path.join(*corpus_parts)

# Load the spaCy pipeline "en_core_web_md"; every text is passed through it
nlp = spacy.load("en_core_web_md")


# Part-of-speech tags to report, in the order of the frequency columns below
POS_TAGS = ("NOUN", "VERB", "ADV", "ADJ")

# Entity labels to report, in the order person / location / organisation.
# NOTE(review): spaCy's English models usually label countries and cities as
# GPE rather than LOC - confirm whether GPE should also count as a location.
ENTITY_LABELS = ("PERSON", "LOC", "ORG")


def pos_relative_freqs(doc, tags=POS_TAGS, per=10000):
    """Return the relative frequency of each POS tag in *doc*.

    Args:
        doc: an iterable of tokens exposing ``pos_`` that also supports
            ``len()`` (a spaCy ``Doc``).
        tags: the POS tags to count; the result follows this order.
        per: the scaling base (frequency per ``per`` tokens).

    Returns:
        A list with one frequency per tag, rounded to two decimals. An
        empty document yields 0.0 for every tag instead of dividing by zero.
    """
    # NOTE(review): the denominator is len(doc), i.e. every token in the doc,
    # not only alphabetic words - confirm this is the intended "words" base.
    total = len(doc)
    counts = dict.fromkeys(tags, 0)

    # Single pass over the document instead of one pass per tag
    for token in doc:
        if token.pos_ in counts:
            counts[token.pos_] += 1

    if total == 0:
        return [0.0 for _ in tags]
    return [round((counts[tag] / total) * per, 2) for tag in tags]


def unique_NE(doc):
    """Count the unique named entities in *doc* for PERSON, LOC and ORG.

    Two entities are the same when both their text and their label match.

    Args:
        doc: an object whose ``ents`` items expose ``text`` and ``label_``
            (a spaCy ``Doc``).

    Returns:
        A list of three counts in the order [PERSON, LOC, ORG]; a label
        that never occurs is reported as 0.
    """
    # A set of (text, label) pairs removes repeated mentions of one entity
    unique_ents = {(ent.text, ent.label_) for ent in doc.ents}
    found_labels = [label for _, label in unique_ents]
    return [found_labels.count(label) for label in ENTITY_LABELS]


# Column order of the table saved for every subfolder
COLUMNS = [
    "Filename",
    "Nouns_Relative_Freq",
    "Verbs_Relative_Freq",
    "Adverbs_Relative_Freq",
    "Adjectives_Relative_Freq",
    "No_unique_per",
    "No_unique_loc",
    "No_unique_org",
]

# Make sure the output folder exists before the first table is written
os.makedirs(os.path.join("..", "out"), exist_ok=True)

# Loop over every subfolder of the corpus
for subfolder in sorted(os.listdir(filepath)):
    subfolder_path = os.path.join(filepath, subfolder)

    # Skip anything that is not a directory (e.g. stray files), so that no
    # table is built or saved from missing or stale data
    if not os.path.isdir(subfolder_path):
        continue

    # One row (dict) per text file in this subfolder
    rows = []

    # Loop over each text file in the subfolder
    for file in glob.glob(os.path.join(subfolder_path, "*.txt")):
        if not os.path.isfile(file):
            continue

        with open(file, "r", encoding="latin-1") as f:
            text = f.read()  # Read in text

        # Remove markup tags. "<.*?>" matches "<", then as few characters as
        # possible, then ">", so each tag is removed separately.
        text = re.sub(r"<.*?>", "", text)
        doc = nlp(text)  # Create spacy doc

        # Relative frequency of each POS per 10,000 tokens
        (
            nouns_relative_freq,
            verbs_relative_freq,
            adverb_relative_freq,
            adjective_relative_freq,
        ) = pos_relative_freqs(doc)

        # Number of unique entities per label
        unique_per_count, unique_loc_count, unique_org_count = unique_NE(doc)

        rows.append({
            "Filename": os.path.basename(file),
            "Nouns_Relative_Freq": nouns_relative_freq,
            "Verbs_Relative_Freq": verbs_relative_freq,
            "Adverbs_Relative_Freq": adverb_relative_freq,
            "Adjectives_Relative_Freq": adjective_relative_freq,
            "No_unique_per": unique_per_count,
            "No_unique_loc": unique_loc_count,
            "No_unique_org": unique_org_count,
        })

    # Create a pandas dataframe for this subfolder; passing the columns
    # explicitly keeps the header even when the subfolder has no text files
    df = pd.DataFrame(rows, columns=COLUMNS)
    df = df.sort_values("Filename")

    # Save the DataFrame as a CSV file in the out folder
    csv_filename = f"../out/{subfolder}_data.csv"
    df.to_csv(csv_filename, index=False)