In [1]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
import PyPDF2
import seaborn as sns
import spacy

%matplotlib inline



In [None]:
# Function to read a document and extract text and year
def read_document(file_path):
    # Extract year from filename if it exists (assuming year is 4 digits)
    year_match = re.search(r'(\d{4})', file_path)
    year = year_match.group(1) if year_match else None
    
    if file_path.endswith('.pdf'): 
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            text = ''
            for page in reader.pages:
                text += page.extract_text()
    elif file_path.endswith('.txt'):
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
    else:
        raise ValueError("Non Existant file or Unsupported file format. Please provide a .pdf or .txt file.")
    
    return text, year

In [None]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline once and keep it in the module-level
# name `nlp`; extract_entities calls `nlp(text)` to tokenize and annotate a document.
nlp = spacy.load("en_core_web_sm")

In [None]:
# # Earlier draft, superseded by the active extract_entities defined later in this notebook: a simple function that uses spaCy to extract entities from a document and returns a table of its 100 most frequent tokens with the following columns: token, frequency, pos_tags, lemma, ner, stem, tense, label, suffix and year.
# def extract_entities(text, year):
#     doc = nlp(text)
#     data = {
#         "token": [],
#         "frequency": [],
#         "pos_tags": [],
#         "lemma": [],
#         "ner": [],
#         "stem": [],
#         "tense": [],
#         "label": [],
#         "suffix": [],
#         "year": []
#     }
    
#     for token in doc:
#         if not token.is_stop and not token.is_punct:
#             data["token"].append(token.text)
#             data["frequency"].append(token.prob)
#             data["pos_tags"].append(token.pos_)
#             data["lemma"].append(token.lemma_)
#             data["ner"].append(token.ent_type_)
#             data["stem"].append(token._.stem)
#             data["tense"].append(token._.tense)
#             data["label"].append(token.dep_)
#             data["suffix"].append(token._.suffix)
#             data["year"].append(year)

#     df = pd.DataFrame(data)
#     df = df.groupby("token").agg({
#         "frequency": "sum",
#         "pos_tags": "first",
#         "lemma": "first",
#         "ner": "first",
#         "stem": "first",
#         "tense": "first",
#         "label": "first",
#         "suffix": "first",
#         "year": "first"
#     }).reset_index()
    
#     df = df.sort_values(by="frequency", ascending=False).head(100)
    
#     return df
    

In [50]:
# simple function that uses spacy to extract entities and returns a table from a document that consists the following colums: token, frequency, pos_tags,lemma, ner,stem, tense,label,suffix and year of the docuement
def extract_entities(text, year):
    doc = nlp(text)
    data = {
        "token": [],
        "frequency": [],
        "pos_tags": [],
        "lemma": [],
        "ner": [],
        "stem": [],
        "tense": [],
        "label": [],
        "suffix": [],
        "year": []
    }
    
    for token in doc:
        if  token.is_stop or  token.is_punct:
            continue
        data["token"].append(token.text)
        data["frequency"].append(doc.count_by(token.i))
        data["pos_tags"].append(token.pos_)
        data["lemma"].append(token.lemma_)
        data["ner"].append(token.ent_type_)
        data["stem"].append(token._.stem if hasattr(token._, 'stem') else None)
        data["tense"].append(token.tag_)
        data["label"].append(token.dep_)
        data["suffix"].append(token.suffix_)
        data["year"].append(year)
    
    df = pd.DataFrame(data)
    return df

In [51]:
# Load the Scarlet Letter text file; read_document returns the document text
# together with the four-digit year it finds in the path string.
scarlet, year = read_document('scarlet_letter_1850.txt')
# Number of characters in the loaded text (displayed as this cell's output).
len(scarlet)

499522

In [None]:
# Simple function to visualize the frequency of tokens in a document
def visualize_token_frequency(df):
    plt.figure(figsize=(12, 6))
    sns.countplot(data=df, x='token', order=df['token'].value_counts().index)
    plt.xticks(rotation=90)
    plt.title('Token Frequency')
    plt.xlabel('Tokens')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# A simple function to show word usage over different documents which are written with in a span of years
def visualize_word_usage_over_years(df, year_column='year', word_column='token'):
    plt.figure(figsize=(12, 6))
    sns.countplot(data=df, x=year_column, hue=word_column)
    plt.title('Word Usage Over Years')
    plt.xlabel('Year')
    plt.ylabel('Count')
    plt.legend(title=word_column)
    plt.show()

In [None]:
# Function to visualize the distribution for parts of speech, named entities, frequency in a document
def visualize_pos_and_ner_distribution(df):
    plt.figure(figsize=(12, 6))
    
    # POS Distribution
    plt.subplot(1, 2, 1)
    sns.countplot(data=df, x='pos_tags', order=df['pos_tags'].value_counts().index)
    plt.title('Part of Speech Distribution')
    plt.xlabel('POS Tags')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    
    # NER Distribution
    plt.subplot(1, 2, 2)
    sns.countplot(data=df, x='ner', order=df['ner'].value_counts().index)
    plt.title('Named Entity Recognition Distribution')
    plt.xlabel('NER Tags')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    
    plt.tight_layout()
    plt.show()

In [None]:
# A combined main function that takes in a list of file paths, reads the documents, extracts entities, and visualizes the results
def main(file_paths):
    all_data = []
    
    for file_path in file_paths:
        text, year = read_document(file_path)
        df = extract_entities(text, year)
        all_data.append(df)
        
        visualize_token_frequency(df)
        visualize_pos_distribution(df)
        visualize_ner_distribution(df)
    
    combined_df = pd.concat(all_data, ignore_index=True)
    return combined_df
# Example usage


In [None]:
ls

assignment.ipynb             spacy.ipynb
Martian_chronicles_1950.pdf  The_Vanishing_Half_2020.pdf
scarlet_letter_1850.txt      tristram_shandy_1759.txt
sister_carrie_1900.txt       White_Teeth_2000.pdf


In [None]:
# Usage of the main function.
# The __name__ guard keeps this from running if the code is imported as a module.
if __name__ == "__main__":
    file_paths = ['scarlet_letter_1850.txt']  # Replace with your file paths
    # main() reads each file, shows its plots and returns the combined token table.
    combined_df = main(file_paths)
    # Show the first rows of the combined table as a quick sanity check.
    print(combined_df.head())
