In [3]:
# main.py

import pandas as pd

from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
from concurrent.futures import ThreadPoolExecutor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import transcript  # Import functions from transcript.py

# 1. Read the CSV file containing player interview data
csv_file = "interviews.csv"
df = pd.read_csv(csv_file)

# 2. Scrape player stats from the hockeydb website
url = "https://www.hockeydb.com/ihdb/draft/nhl2016e.html"
# Send a browser-like User-Agent rather than urllib's default one.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/58.0.3029.110 Safari/537.3'}
req = Request(url, headers=headers)
# Read the body inside a context manager so the connection is always closed,
# and bound the wait so a stalled server cannot hang the script.
# urlopen raises urllib.error.HTTPError for 4xx/5xx responses (e.g. 403).
with urlopen(req, timeout=30) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# Collect the rows once; the column names are taken from the second row.
table_rows = table.find_all('tr')
if len(table_rows) < 2:
    raise ValueError(f"Stats table at {url} has no header row to read column names from")
table_headers = [th.text.strip() for th in table_rows[1].find_all('th')]

data = []
# Extract data rows from the table (starting from the third row)
for row in table_rows[2:]:
    row_data = [cell.text.strip() for cell in row.find_all('td')]
    # Rows without any <td> cells carry no player data; skipping them keeps
    # all-empty records out of the DataFrame.
    if row_data:
        data.append(row_data)
# Create a DataFrame with the player stats (every value is still a string here)
player_stats = pd.DataFrame(data, columns=table_headers)

# 3. Prepare a list of player data tuples (playerName, youtube_url) from the CSV.
#    Zipping the two columns avoids building a full Series per row, which is
#    what iterrows() does just to read two fields.
player_data_list = list(zip(df['playerName'], df['url']))

# 4. Process the player data concurrently using functions from transcript.py.
#    executor.map yields results in input order; entries that came back as
#    None (failed processing) are dropped as they arrive.
with ThreadPoolExecutor() as executor:
    processed_results = [
        outcome
        for outcome in executor.map(transcript.process_player_data, player_data_list)
        if outcome is not None
    ]

# 5. If valid data exists, merge and process it further
if processed_results:
    # 6. Define a function to train the ML model
    def train_model(X_train, y_train):
        """Fit a TF-IDF + linear regression pipeline and return it.

        Args:
            X_train: Training documents handed to ``TfidfVectorizer``
                (here, the 'Answers' text column).
            y_train: Numeric targets aligned with ``X_train``
                (here, the 'GP' values).

        Returns:
            The fitted ``Pipeline`` with steps named 'tfidf' and 'model'.
        """
        text_pipeline = Pipeline([
            # Cap the vocabulary at the 5000 most frequent terms.
            ('tfidf', TfidfVectorizer(max_features=5000)),
            ('model', LinearRegression())
        ])
        text_pipeline.fit(X_train, y_train)
        return text_pipeline

    final_df = pd.concat(processed_results, ignore_index=True)

    # Merge with player_stats DataFrame on the player name columns.
    # The inner join keeps only players present in both frames.
    merged_df = pd.merge(final_df, player_stats, left_on='PlayerName', right_on='Player', how='inner')

    # 'GP' comes from scraped cell text, so blank or non-numeric cells are
    # possible; coerce those to NaN and exclude them rather than letting
    # astype(int) raise ValueError on the first bad value.
    games_played = pd.to_numeric(merged_df['GP'], errors='coerce')
    has_target = games_played.notna()

    # Define features (using 'Answers' as text) and target ('GP', games played)
    X = merged_df.loc[has_target, 'Answers']
    y = games_played[has_target].astype(int)  # Ensure target is integer

    if len(X) < 2:
        # train_test_split cannot produce a non-empty train and test set
        # from fewer than two samples.
        print("Not enough merged rows with a numeric GP value to train and evaluate a model.")
    else:
        # Split the dataset into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train the model and evaluate its performance
        model = train_model(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        print(f'Mean Squared Error: {mse}')

        # Display the 5 words with the largest regression coefficients,
        # ordered from largest to smallest.
        feature_names = model.named_steps['tfidf'].get_feature_names_out()
        coef = model.named_steps['model'].coef_
        top_5_words = [feature_names[i] for i in coef.argsort()[-5:][::-1]]
        print(f'Top 5 words that correlate with GP: {top_5_words}')

    # Optionally, display the merged DataFrame
    print(merged_df)
else:
    print("No valid data to process. Check the speech recognition process.")

# 7. Write the scraped player_stats table to disk as CSV, leaving out the
#    DataFrame's row index.
player_stats.to_csv(path_or_buf='drafted_players.csv', index=False)


HTTPError: HTTP Error 403: Forbidden

C:\Users\jesse\NHL_Prospect_Interview_Performance_Correlation_Data_Engineering_Project
