In [None]:
#
# TRAFFIC RANK SECTION
#


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix

# Location of the taxi-rides CSV on the local machine
fPath = 'C:\\Users\\jacks\\OneDrive\\Documents\\BigData\\chicago-taxi-rides.csv'
taxiData = pd.read_csv(fPath)

# Keep only the rows where both community-area columns are populated
area_cols = ['pickup_community_area', 'dropoff_community_area']
neat_df = taxiData.dropna(subset=area_cols)

# Trip counts are taken as-is; the two community-area columns are cast to int
trips = neat_df['trips']
dropoff = neat_df['dropoff_community_area'].astype(int)
pickup = neat_df['pickup_community_area'].astype(int)

# Scatter of dropoff area (x) against pickup area (y); marker area scales with the trip count
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(
    neat_df['dropoff_community_area'],
    neat_df['pickup_community_area'],
    s=trips * 0.0008,
    alpha=0.45,
)
ax.set_title('Chicago Taxi Trips Scatterplot')
ax.set_ylabel('Pickup Community Area')
ax.set_xlabel('Dropoff Community Area')
ax.grid(True)
plt.show()





# PageRank-style "traffic rank" over the 77 community areas
num_com_areas = 77
beta_decay = 0.85   # weight given to following observed trips at each step
iterations = 6      # number of update steps to run


# Area numbers are shifted by one to become 0-based matrix indices:
# row = pickup area, column = dropoff area, value = trips.
tMatrix = coo_matrix((trips, (pickup - 1, dropoff - 1)), shape=(num_com_areas, num_com_areas))

traffic = tMatrix.toarray()

# Normalizing each row so it sums to 1 (a row-stochastic transition matrix).
# `out` is pre-filled with zeros because np.divide leaves entries untouched
# wherever `where` is False; without it, rows whose sum is 0 would contain
# uninitialised memory. Such rows stay all-zero, so the rank mass held by an
# area with no outgoing trips is not redistributed.
rows = traffic.sum(axis=1)
tempMatrix = np.divide(
    traffic,
    rows[:, np.newaxis],
    out=np.zeros(traffic.shape, dtype=float),
    where=rows[:, np.newaxis] != 0,
)

# Creating the rank vector with equal probabilities to start
rankVector = np.ones(num_com_areas) / num_com_areas
ranks = [rankVector]

# Run `iterations` update steps. `rank_vector` is the running estimate and
# must be seeded from the initial vector before the loop reads it.
rank_vector = rankVector
for _ in range(iterations):
    rvNew = beta_decay * np.dot(rank_vector, tempMatrix) + (1 - beta_decay) / num_com_areas
    ranks.append(rvNew)
    rank_vector = rvNew

# One row per iteration (row 0 is the uniform start), one column per area
rank_df = pd.DataFrame(
    ranks,
    columns=[f'Area: {i+1}' for i in range(num_com_areas)],
    index=[f'Iteration: {i}' for i in range(iterations + 1)],
)
print(rank_df)


In [None]:
#
# TEXT PROCESSING SECTION
#

import os
import tarfile
import glob
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Download a newline-separated stop-word list. The timeout keeps the cell from
# hanging forever on a stalled connection, and raise_for_status() stops an HTTP
# error page from being silently parsed as stop words.
stopwords_response = requests.get(
    "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt",
    timeout=30,
)
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content

# De-duplicate the entries; sorting makes the resulting list order reproducible
stopwords = sorted(set(stopwords_list.decode().splitlines()))

def extract(tarGZ, ePath):
    """Unpack a gzip-compressed tar archive into a directory.

    Args:
        tarGZ: path to the ``.tar.gz`` archive to read.
        ePath: directory the archive members are extracted into.

    Raises:
        tarfile.TarError: if the archive cannot be read, or (when extraction
            filters are available) a member is rejected by the ``data`` filter.
        OSError: if the archive file cannot be opened.
    """
    with tarfile.open(tarGZ, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # The 'data' filter refuses members that would land outside ePath
            # (absolute paths, '..' components, escaping links).
            tar.extractall(path=ePath, filter="data")
        else:
            # Older interpreters without extraction filters: unfiltered
            # extraction, so only use archives from a trusted source.
            tar.extractall(path=ePath)

def load(ePath):
    """Read every ``.txt`` file found under ``ePath`` (searched recursively).

    Args:
        ePath: root directory to search.

    Returns:
        A list of strings, one per file, each with leading and trailing
        whitespace stripped. Order follows whatever ``glob`` yields.
    """
    def _read_stripped(txt_file):
        # Files are decoded as UTF-8
        with open(txt_file, 'r', encoding='utf-8') as handle:
            return handle.read().strip()

    pattern = os.path.join(ePath, '**', '*.txt')
    return [_read_stripped(match) for match in glob.glob(pattern, recursive=True)]

def find_words(speeches):
    """Print the 15 terms with the largest summed TF-IDF weight.

    Args:
        speeches: the documents to analyse, one string per speech.

    Returns nothing; the ranking is written to standard output.
    """
    # Terms in the module-level `stopwords` list are excluded from scoring.
    # TfidfVectorizer(stop_words='english') is a built-in alternative list.
    tfidf_model = TfidfVectorizer(stop_words=stopwords)
    doc_term = tfidf_model.fit_transform(speeches)

    # Add up each term's weight over all documents (one total per vocabulary entry)
    term_totals = doc_term.toarray().sum(axis=0)
    vocabulary = tfidf_model.get_feature_names_out()

    # Pair every term with its total, then order from highest to lowest total
    ranked = list(zip(vocabulary, term_totals))
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    # Report the 15 highest-scoring terms
    print("TF-IDF Top 15 Scores:")
    for term, total in ranked[:15]:
        print(f"{term}: {total:.4f}")

def main(tar_gz_path, extract_path):
    """Unpack the archive, read the extracted speeches, and print their top TF-IDF terms.

    Args:
        tar_gz_path: path to the ``.tar.gz`` archive of speeches.
        extract_path: directory to extract into and then read from.
    """
    extract(tar_gz_path, extract_path)
    speeches = load(extract_path)
    find_words(speeches)

# Choose whose speeches to analyse
pres_name = 'clinton'

# The archive and its extraction folder sit side by side in this directory
speech_dir = 'C:\\Users\\jacks\\OneDrive\\Documents\\BigData\\prez_speeches'
tarGZ = f'{speech_dir}\\{pres_name}.tar.gz'
ePath = f'{speech_dir}\\{pres_name}'

# Extract, load and report in one call
main(tarGZ, ePath)