In [None]:
import io
import os
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen, Request

import pandas as pd
import spacy
import speech_recognition as sr
from bs4 import BeautifulSoup
from pydub import AudioSegment, silence
from pytube import YouTube
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Initialize spaCy
# The `en_core_web_sm` model is loaded once at module level; `nlp` is used
# further down to split recognized text into sentences.
nlp = spacy.load("en_core_web_sm")

# Initialize a speech recognition recognizer
# A single shared Recognizer instance: it both loads audio from WAV files
# (`record`) and sends audio to Google Speech Recognition (`recognize_google`).
recognizer = sr.Recognizer()

# Define a function to download audio from a YouTube video URL and transcribe it
def transcribe_audio(youtube_url):
    """Download a YouTube video's audio track and load it as WAV audio data.

    Despite the name, no speech-to-text happens here: the function only
    downloads the first audio-only stream, converts it to WAV and reads it
    with the module-level ``recognizer`` so a caller can run recognition.

    Args:
        youtube_url: URL handed to pytube's ``YouTube``.

    Returns:
        A ``(audio_data, wav_file)`` tuple: the object returned by
        ``recognizer.record`` and the path of the WAV file written next to
        the download. If any step raises, the error is printed and
        ``(None, "")`` is returned instead.
    """
    try:
        # Download the YouTube video's first audio-only stream
        youtube = YouTube(youtube_url)
        audio_stream = youtube.streams.filter(only_audio=True).first()
        if audio_stream is None:
            # Fail with a readable message instead of an AttributeError on None
            raise ValueError("no audio-only stream available for this video")
        audio_file = audio_stream.download()

        # Convert the audio to WAV format using PyDub
        audio = AudioSegment.from_file(audio_file)
        # Swap whatever extension the download has (.webm, .mp4, ...) for .wav.
        # A plain str.replace(".webm", ...) leaves non-webm paths unchanged, so
        # the WAV data would be written over the download under a wrong name.
        wav_file = os.path.splitext(audio_file)[0] + ".wav"
        audio.export(wav_file, format="wav")

        # Load the audio file for recognition
        with sr.AudioFile(wav_file) as source:
            audio_data = recognizer.record(source)

        return audio_data, wav_file  # Return both the audio data and the WAV file path
    except Exception as e:
        # Best effort: report the failure and let the caller skip this video
        print(f"Error transcribing audio: {str(e)}")
        return None, ""

# Function to split audio based on silence threshold
def split_audio_on_silence(audio, silence_thresh=35):
    """Split ``audio`` into its non-silent chunks using pydub.

    Args:
        audio: The segment to split (passed straight to
            ``silence.split_on_silence``).
        silence_thresh: How far below full scale, in dB, a stretch must be
            to count as silence. The sign is ignored, so ``35`` and ``-35``
            both mean -35 dBFS.

    Returns:
        Whatever ``silence.split_on_silence`` returns for the given audio
        and threshold (the chunks between silences).
    """
    # pydub expresses this threshold in dBFS, where 0 is the loudest possible
    # level. A positive value would classify the entire recording as silence
    # and yield no chunks, so normalise to a non-positive number.
    threshold_dbfs = -abs(silence_thresh)
    audio_parts = silence.split_on_silence(audio, silence_thresh=threshold_dbfs)
    return audio_parts

# Function to process player data
def process_player_data(player_data):
    """Turn one player's interview video into a question/answer DataFrame.

    Args:
        player_data: A ``(playerName, youtube_url)`` pair.

    Returns:
        A DataFrame with columns ``PlayerName``, ``Questions`` and
        ``Answers`` (one row per consecutive sentence pair), or ``None`` when
        the audio could not be loaded or Google Speech Recognition failed.
    """
    # NOTE(review): this function is defined a second time later in this cell.
    # Python keeps the later binding, so that copy is the one the executor
    # actually calls. Keep the two in sync or delete one of them.
    playerName, youtube_url = player_data

    # Download the video's audio and load it for recognition
    audio_data, wav_file = transcribe_audio(youtube_url)

    if audio_data is None:
        return None

    # Split the audio based on silence
    audio_parts = split_audio_on_silence(AudioSegment.from_file(wav_file))

    if audio_parts:
        # Concatenate the audio parts to get rid of silence. Start from the
        # first chunk rather than AudioSegment.silent(), which would prepend
        # a block of silence to the result.
        concatenated_audio = audio_parts[0]
        for part in audio_parts[1:]:
            concatenated_audio += part

        # Hand the silence-free audio to the recognizer through an in-memory
        # WAV file; previously the concatenated audio was built but never used.
        wav_buffer = io.BytesIO()
        concatenated_audio.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        with sr.AudioFile(wav_buffer) as source:
            audio_data = recognizer.record(source)
    # When no chunks were found, fall back to the unmodified recording.

    # Perform speech recognition on the concatenated audio
    try:
        print(f"Processing data for {playerName}...")
        text = recognizer.recognize_google(audio_data)

        # Use spaCy for sentence segmentation
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]

        # Separate sentences into questions and answers: even positions are
        # treated as questions, odd positions as the answers that follow them
        questions = sentences[::2]
        answers = sentences[1::2]

        # An odd sentence count leaves a trailing question with no answer
        if len(questions) > len(answers):
            questions.pop()

        # Create a player dataframe
        player_df = pd.DataFrame({
            'PlayerName': [playerName] * len(answers),
            'Questions': questions,
            'Answers': answers,
        })

        return player_df
    except sr.UnknownValueError:
        print(f"Google Speech Recognition could not understand audio for {playerName}. Skipping...")
        return None
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service for {playerName}; {str(e)}. Skipping...")
        return None

# Read the interview list; the `playerName` and `url` columns are used below
csv_file = "interviews.csv"
df = pd.read_csv(csv_file)

# Scrape the player stats table from the web page (this is an HTML page, not
# a CSV file). Presumably the 2016 NHL draft listing, judging by the URL.
url = "https://www.hockeydb.com/ihdb/draft/nhl2016e.html"
request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
req = Request(url, headers=request_headers)
# Read the body inside a context manager so the HTTP connection is closed
with urlopen(req) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table')
if table is None:
    # Fail with a clear message instead of an AttributeError on None below
    raise ValueError(f"No <table> element found at {url}")

table_rows = table.find_all('tr')

# Column names are taken from the <th> cells of the second <tr>
headers = [th.text.strip() for th in table_rows[1].find_all('th')]

# Collect the data rows that follow the header row. Rows without any <td>
# cells carry no player data and would only add blank rows, so skip them.
data = []
for row in table_rows[2:]:
    row_data = [cell.text.strip() for cell in row.find_all('td')]
    if row_data:
        data.append(row_data)

# Create the player_stats DataFrame
player_stats = pd.DataFrame(data, columns=headers)

# Create a list of (playerName, url) pairs to process
player_data_list = list(zip(df['playerName'], df['url']))

# Function to train the machine learning model
def train_model(X_train, y_train):
    """Fit a TF-IDF + linear-regression pipeline on raw text.

    Args:
        X_train: Text samples to vectorize.
        y_train: Targets to regress on, aligned with ``X_train``.

    Returns:
        The fitted ``Pipeline`` whose steps are named ``'tfidf'`` and
        ``'model'``.
    """
    steps = [
        # max_features limits the TF-IDF vocabulary size; adjust as needed
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('model', LinearRegression()),
    ]
    text_pipeline = Pipeline(steps)

    # Vectorize the text and fit the regressor in one call
    text_pipeline.fit(X_train, y_train)
    return text_pipeline

# Function to process player data and return relevant information
def process_player_data(player_data):
    """Turn one player's interview video into a question/answer DataFrame.

    Args:
        player_data: A ``(playerName, youtube_url)`` pair.

    Returns:
        A DataFrame with columns ``PlayerName``, ``Questions`` and
        ``Answers`` (one row per consecutive sentence pair), or ``None`` when
        the audio could not be loaded or Google Speech Recognition failed.
    """
    # NOTE(review): an earlier `def process_player_data` exists above in this
    # cell. This later definition replaces it before the executor runs, so
    # consider deleting the earlier copy.
    playerName, youtube_url = player_data

    # Download the video's audio and load it for recognition
    audio_data, wav_file = transcribe_audio(youtube_url)

    if audio_data is None:
        return None

    # Split the audio based on silence
    audio_parts = split_audio_on_silence(AudioSegment.from_file(wav_file))

    if audio_parts:
        # Concatenate the audio parts to get rid of silence. Start from the
        # first chunk rather than AudioSegment.silent(), which would prepend
        # a block of silence to the result.
        concatenated_audio = audio_parts[0]
        for part in audio_parts[1:]:
            concatenated_audio += part

        # Hand the silence-free audio to the recognizer through an in-memory
        # WAV file; previously the concatenated audio was built but never used.
        wav_buffer = io.BytesIO()
        concatenated_audio.export(wav_buffer, format="wav")
        wav_buffer.seek(0)
        with sr.AudioFile(wav_buffer) as source:
            audio_data = recognizer.record(source)
    # When no chunks were found, fall back to the unmodified recording.

    # Perform speech recognition on the concatenated audio
    try:
        print(f"Processing data for {playerName}...")
        text = recognizer.recognize_google(audio_data)

        # Use spaCy for sentence segmentation
        doc = nlp(text)
        sentences = [sent.text for sent in doc.sents]

        # Separate sentences into questions and answers: even positions are
        # treated as questions, odd positions as the answers that follow them
        questions = sentences[::2]
        answers = sentences[1::2]

        # An odd sentence count leaves a trailing question with no answer
        if len(questions) > len(answers):
            questions.pop()

        # Create a player dataframe
        player_df = pd.DataFrame({
            'PlayerName': [playerName] * len(answers),
            'Questions': questions,
            'Answers': answers,
        })

        return player_df
    except sr.UnknownValueError:
        print(f"Google Speech Recognition could not understand audio for {playerName}. Skipping...")
        return None
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service for {playerName}; {str(e)}. Skipping...")
        return None

# Process player data concurrently
with ThreadPoolExecutor() as executor:
    processed_results = list(executor.map(process_player_data, player_data_list))

# Filter out None results (players whose audio or recognition failed)
processed_results = [result for result in processed_results if result is not None]

# Bind final_df up front so later cells can reference it even when no
# interview was processed successfully.
final_df = pd.DataFrame(columns=['PlayerName', 'Questions', 'Answers'])

# Concatenate player dataframes into a final dataframe if there are any
if processed_results:
    final_df = pd.concat(processed_results, ignore_index=True)

    # Merge with player_stats DataFrame on player name
    merged_df = pd.merge(final_df, player_stats, left_on='PlayerName', right_on='Player', how='inner')

    # Define features (answers) and target (games_played)
    X = merged_df['Answers']
    # The scraped GP cells are strings. Blank or non-numeric cells (presumably
    # players with no games played yet) would make astype(int) raise, so
    # coerce them to 0 instead.
    y = pd.to_numeric(merged_df['GP'], errors='coerce').fillna(0).astype(int)

    # train_test_split cannot produce a non-empty train and test set from
    # fewer than two rows, so only model when there is enough data.
    if len(merged_df) >= 2:
        # Split the dataset into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train the machine learning model
        model = train_model(X_train, y_train)

        # Evaluate the model
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        print(f'Mean Squared Error: {mse}')

        # Display the 5 words with the largest regression coefficients
        feature_names = model.named_steps['tfidf'].get_feature_names_out()
        coef = model.named_steps['model'].coef_
        top_5_words = [feature_names[i] for i in coef.argsort()[-5:][::-1]]
        print(f'Top 5 words that correlate with GP: {top_5_words}')
    else:
        print("Not enough matched interview rows to train a model. Check that player names match the stats table.")

    # Display the final dataframe
    print(merged_df)
else:
    print("No valid data to concatenate. Check the speech recognition process.")
# The scraped stats are saved regardless of whether any interview was processed
player_stats.to_csv('drafted_players.csv', index=False)

In [None]:
# Rich display of the per-player question/answer frame assembled by the previous cell
display(final_df)