In [11]:
# Step 1: Load and Preprocess the CSV File

import pandas as pd

# Load the CSV file
file_path = '/Users/hemantg/Desktop/d-project-scraped-final.csv'  # Update this path
df = pd.read_csv(file_path)

# Display the first few rows
print("Original Data:")
print(df.head())

# Preprocessing
# Remove extra whitespace and newlines in text columns
df['title'] = df['title'].str.strip().replace(r'\s+', ' ', regex=True)
df['content'] = df['content'].str.strip().replace(r'\s+', ' ', regex=True)

# Standardize the date format
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)

# Drop rows with invalid dates
df = df.dropna(subset=['date'])

# Sort the DataFrame by date
df = df.sort_values(by='date').reset_index(drop=True)

# Display preprocessed data
print("\nPreprocessed Data:")
print(df.head())



Original Data:
                                                 url  \
0  https://www.espncricinfo.com/story/bcci-launch...   
1  https://www.espncricinfo.com/story/warne-joins...   
2  https://www.espncricinfo.com/story/mcgrath-hop...   
3  https://www.espncricinfo.com/story/will-twenty...   
4  https://www.espncricinfo.com/story/jayawardene...   

                                               title       date  \
0             International Twenty20 league launched  13-Sep-07   
1  Warne joins player pool for Indian Twenty20 le...  16-Sep-07   
2       McGrath hopes Twenty20 stays as third format  19-Sep-07   
3                       Will Twenty20 wreck cricket?  23-Sep-07   
4    Jayawardene among eight Sri Lankans to join IPL   1-Oct-07   

                                             content Unnamed: 4 Unnamed: 5  \
0  Stephen Fleming and Glenn McGrath at the launc...        NaN        NaN   
1  Shane Warne and Glenn McGrath could soon be pl...        NaN        NaN   
2  Glenn Mc

  df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)


In [12]:
# Step 1: Load and Preprocess the CSV File

import pandas as pd

# Load the CSV file
file_path = '/Users/hemantg/Desktop/d-project-scraped-final.csv'  # Update this path
df = pd.read_csv(file_path)

# Remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Display the first few rows
print("Original Data (Cleaned):")
print(df.head())

# Preprocessing
# Remove extra whitespace and newlines in text columns
df['title'] = df['title'].str.strip().replace(r'\s+', ' ', regex=True)
df['content'] = df['content'].str.strip().replace(r'\s+', ' ', regex=True)

# Standardize the date format
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)

# Drop rows with invalid dates
df = df.dropna(subset=['date'])

# Sort the DataFrame by date
df = df.sort_values(by='date').reset_index(drop=True)

# Display preprocessed data
print("\nPreprocessed Data:")
print(df.head())



Original Data (Cleaned):
                                                 url  \
0  https://www.espncricinfo.com/story/bcci-launch...   
1  https://www.espncricinfo.com/story/warne-joins...   
2  https://www.espncricinfo.com/story/mcgrath-hop...   
3  https://www.espncricinfo.com/story/will-twenty...   
4  https://www.espncricinfo.com/story/jayawardene...   

                                               title       date  \
0             International Twenty20 league launched  13-Sep-07   
1  Warne joins player pool for Indian Twenty20 le...  16-Sep-07   
2       McGrath hopes Twenty20 stays as third format  19-Sep-07   
3                       Will Twenty20 wreck cricket?  23-Sep-07   
4    Jayawardene among eight Sri Lankans to join IPL   1-Oct-07   

                                             content  
0  Stephen Fleming and Glenn McGrath at the launc...  
1  Shane Warne and Glenn McGrath could soon be pl...  
2  Glenn McGrath has not played cricket since the...  
3  The fans hav

  df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)


In [13]:
# Step 1: Load and Preprocess the CSV File

import pandas as pd

# Load the CSV file
file_path = '/Users/hemantg/Desktop/dl-project-scraped-final.csv'  # Update this path
df = pd.read_csv(file_path)

# Remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Display the first few rows
print("Original Data (Cleaned):")
print(df.head())

# Preprocessing
# Remove extra whitespace and newlines in text columns
df['title'] = df['title'].str.strip().replace(r'\s+', ' ', regex=True)
df['content'] = df['content'].str.strip().replace(r'\s+', ' ', regex=True)

# Standardize the date format explicitly
df['date'] = pd.to_datetime(df['date'], format='%d-%b-%y', errors='coerce')

# Drop rows with invalid dates
df = df.dropna(subset=['date'])

# Sort the DataFrame by date
df = df.sort_values(by='date').reset_index(drop=True)

# Display preprocessed data
print("\nPreprocessed Data:")
print(df.head())



Original Data (Cleaned):
                                                 url  \
0  https://www.espncricinfo.com/story/bcci-launch...   
1  https://www.espncricinfo.com/story/warne-joins...   
2  https://www.espncricinfo.com/story/mcgrath-hop...   
3  https://www.espncricinfo.com/story/will-twenty...   
4  https://www.espncricinfo.com/story/jayawardene...   

                                               title       date  \
0             International Twenty20 league launched  13-Sep-07   
1  Warne joins player pool for Indian Twenty20 le...  16-Sep-07   
2       McGrath hopes Twenty20 stays as third format  19-Sep-07   
3                       Will Twenty20 wreck cricket?  23-Sep-07   
4    Jayawardene among eight Sri Lankans to join IPL   1-Oct-07   

                                             content  
0  Stephen Fleming and Glenn McGrath at the launc...  
1  Shane Warne and Glenn McGrath could soon be pl...  
2  Glenn McGrath has not played cricket since the...  
3  The fans hav

In [18]:
# Step 2: Sentiment Analysis Using Grok API
import requests
import json

# Define the Grok API Key and Endpoint
API_KEY = "xai-wstXyvOcCGFeQlgy2LtyFzM1GLXDtCiXBrJO1snASustMyqOMWo4uKLwtQgVyYCgCGDOxODvW5RTz6B7"
API_URL = "https://api.x.ai/v1/chat/completions"

# Function to Analyze Sentiment Using Grok API
def analyze_sentiment_grok(text):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "grok-beta",
        "messages": [
            {"role": "system", "content": "You are a sentiment analysis assistant."},
            {"role": "user", "content": f"Analyze the sentiment of the following text: {text}"}
        ],
        "temperature": 0
    }

    response = requests.post(API_URL, headers=headers, json=payload)
    if response.status_code == 200:
        result = response.json()
        completion_text = result["choices"][0]["message"]["content"]
        # Extract sentiment label and score from the response
        label = "Neutral"
        score = 0.0
        
        # Example logic for extracting sentiment information (adjust if needed)
        if "Positive" in completion_text:
            label = "Positive"
            score = 0.9
        elif "Negative" in completion_text:
            label = "Negative"
            score = 0.1
        
        return label, score, completion_text
    else:
        print(f"API Error: {response.status_code}, {response.text}")
        return "Neutral", 0.0, "Error in response"      

# Apply Sentiment Analysis to the DataFrame
df[['sentiment_label', 'sentiment_score', 'raw_response']] = df['content'].apply(
    lambda x: pd.Series(analyze_sentiment_grok(x))
)

# Display Results
print("\nData with Sentiment Analysis from Grok:")
print(df[['title', 'date', 'sentiment_label', 'sentiment_score', 'raw_response']].head())


API Error: 520, <!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>api.x.ai | 520: Web server is returning an unknown error</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wrapper">
    <div id="cf-error-details" class="p-0">
        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
            <h1 class="inline-block sm:block sm:mb-2 font-light text-60 lg:text-

ReadTimeout: HTTPSConnectionPool(host='api.x.ai', port=443): Read timed out. (read timeout=None)

In [None]:
import requests

# Define OpenAI Batch API Endpoint and Headers
batch_endpoint = 'https://api.openai.com/v1/batches'
headers = {
    'Authorization': f'Bearer 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',  # Replace with your actual API key
    'Content-Type': 'application/json'
}

# Provide the Cloud Storage URL of Your Uploaded File
uploaded_file_url = "https://YOUR_STORAGE_PROVIDER_URL/batch_input.jsonl"

# Define the Batch Request Payload
batch_payload = {
    "input_file": uploaded_file_url,  # URL of the uploaded .jsonl file
    "output_file": "batch_output.jsonl",  # Desired output file name
    "model": "gpt-4.0-mini",
    "temperature": 0
}

# Submit the Batch Request
response = requests.post(batch_endpoint, headers=headers, json=batch_payload)

# Handle Response
if response.status_code == 200:
    batch_id = response.json()['id']
    print(f"Batch submitted successfully. Batch ID: {batch_id}")
else:
    print(f"Batch submission failed: {response.status_code} - {response.text}")


In [19]:
import pandas as pd
import os

def combine_csvs(input_directory, output_filename, merge_method='concat'):
    """
    Combine multiple CSV files from a specified directory.
    
    Parameters:
    - input_directory: Path to the directory containing CSV files
    - output_filename: Name of the output combined CSV file
    - merge_method: Method of combining CSVs ('concat' or 'merge')
    
    Returns:
    - Combined DataFrame
    - Saves combined DataFrame to a CSV file
    """
    # Get a list of all CSV files in the directory
    csv_files = [f for f in os.listdir(input_directory) if f.endswith('.csv')]
    
    if not csv_files:
        raise ValueError("No CSV files found in the specified directory")
    
    # Read all CSV files
    dataframes = []
    for file in csv_files:
        file_path = os.path.join(input_directory, file)
        df = pd.read_csv(file_path)
        dataframes.append(df)
    
    # Combine DataFrames based on merge method
    if merge_method == 'concat':
        # Simple concatenation (stacks DataFrames vertically)
        combined_df = pd.concat(dataframes, ignore_index=True)
    elif merge_method == 'merge':
        # Merge DataFrames (requires a common column)
        combined_df = dataframes[0]
        for df in dataframes[1:]:
            combined_df = pd.merge(combined_df, df, how='outer')
    else:
        raise ValueError("Invalid merge method. Choose 'concat' or 'merge'")
    
    # Save combined DataFrame to CSV
    combined_df.to_csv(output_filename, index=False)
    
    print(f"Combined {len(csv_files)} CSV files")
    print(f"Output saved to {output_filename}")
    
    return combined_df

# Example usage
if __name__ == "__main__":
    # Replace these with your actual paths
    input_dir = '/Users/hemantg/Downloads/output-3files'
    output_file = '/Users/hemantg/Downloads/combined_output.csv'
    
    # Combine CSVs using concatenation
    result_concat = combine_csvs(input_dir, output_file, merge_method='concat')
    
    # If you need to merge on a specific column, use 'merge' method
    # result_merge = combine_csvs(input_dir, output_file, merge_method='merge')

Combined 3 CSV files
Output saved to /Users/hemantg/Downloads/combined_output.csv


In [None]:
import pandas as pd
import os
import natsort  # for natural sorting of filenames

def combine_sorted_csvs(input_directory, output_filename):
    """
    Combine CSV files from a directory, sorted by filename in a natural order.
    
    Parameters:
    - input_directory: Path to the directory containing CSV files
    - output_filename: Name of the output combined CSV file
    
    Returns:
    - Combined DataFrame
    - Saves combined DataFrame to a CSV file
    """
    # Get a list of CSV files, sorted naturally
    csv_files = natsort.natsorted(
        [f for f in os.listdir(input_directory) if f.endswith('.csv')]
    )
    
    if not csv_files:
        raise ValueError("No CSV files found in the specified directory")
    
    # Read and combine sorted CSV files
    dataframes = []
    for file in csv_files:
        file_path = os.path.join(input_directory, file)
        df = pd.read_csv(file_path)
        
        # Optional: Add a column to track original filename if needed
        df['source_file'] = file
        
        dataframes.append(df)
        print(f"Added file: {file}")
    
    # Concatenate DataFrames
    combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Save combined DataFrame to CSV
    combined_df.to_csv(output_filename, index=False)
    
    print(f"\nCombined {len(csv_files)} CSV files")
    print(f"Files sorted and combined in order: {', '.join(csv_files)}")
    print(f"Output saved to {output_filename}")
    
    return combined_df

# Example usage
if __name__ == "__main__":
    # Replace with your actual directory path
    input_dir = '/Users/hemantg/Downloads/output-3files'
    output_file = '/Users/hemantg/Downloads/combined_output1.csv'
    
    # Combine sorted CSVs
    result = combine_sorted_csvs(input_dir, output_file)

ModuleNotFoundError: No module named 'natsort'

In [21]:
import pandas as pd
import os

def combine_sorted_csvs(input_directory, output_filename):
    """
    Combine CSV files from a directory, sorted by filename.
    
    Parameters:
    - input_directory: Path to the directory containing CSV files
    - output_filename: Name of the output combined CSV file
    
    Returns:
    - Combined DataFrame
    - Saves combined DataFrame to a CSV file
    """
    # Get a list of CSV files, sorted alphabetically
    csv_files = sorted([f for f in os.listdir(input_directory) if f.endswith('.csv')])
    
    if not csv_files:
        raise ValueError("No CSV files found in the specified directory")
    
    # Read and combine sorted CSV files
    dataframes = []
    for file in csv_files:
        file_path = os.path.join(input_directory, file)
        df = pd.read_csv(file_path)
        
        # Optional: Add a column to track original filename if needed
        df['source_file'] = file
        
        dataframes.append(df)
        print(f"Added file: {file}")
    
    # Concatenate DataFrames
    combined_df = pd.concat(dataframes, ignore_index=True)
    
    # Save combined DataFrame to CSV
    combined_df.to_csv(output_filename, index=False)
    
    print(f"\nCombined {len(csv_files)} CSV files")
    print(f"Files sorted and combined in order: {', '.join(csv_files)}")
    print(f"Output saved to {output_filename}")
    
    return combined_df

# Example usage
if __name__ == "__main__":
    # Replace with your actual directory path
    input_dir = '/Users/hemantg/Downloads/output-3files'
    output_file = '/Users/hemantg/Downloads/combined_output1.csv'
    
    # Combine sorted CSVs
    result = combine_sorted_csvs(input_dir, output_file)

Added file: output_1of3.csv
Added file: output_2of3.csv
Added file: output_3of3.csv

Combined 3 CSV files
Files sorted and combined in order: output_1of3.csv, output_2of3.csv, output_3of3.csv
Output saved to /Users/hemantg/Downloads/combined_output1.csv


In [22]:
import pandas as pd
import json

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep='\t')

# Initialize a list to store processed rows
processed_rows = []

# Process each row
for index, row in df.iterrows():
    date = row['date']
    processed_results = row['processed_results']

    try:
        # Load the JSON data
        players_data = json.loads(processed_results)
        row_data = {'date': date}
        
        # Extract player information
        for i, player in enumerate(players_data):
            row_data[f'player{i+1}_name'] = player['player_name']
            row_data[f'player{i+1}_output'] = json.dumps(player)
        
        processed_rows.append(row_data)
    
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


KeyError: 'date'

In [23]:
import pandas as pd
import json

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep='\t')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date'] if 'date' in df.columns else row[df.columns[0]]
        processed_results = row['processed_results']

        # Load the JSON data
        players_data = json.loads(processed_results)
        row_data = {'date': date}
        
        # Extract player information
        for i, player in enumerate(players_data):
            row_data[f'player{i+1}_name'] = player['player_name']
            row_data[f'player{i+1}_output'] = json.dumps(player)
        
        processed_rows.append(row_data)
    
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")
    except KeyError as e:
        print(f"Missing column: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date,original_text,processed_results'], dtype='object')
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing column: 'processed_results'
Missing co

In [1]:
import pandas as pd
import json

# Load the CSV with the correct delimiter
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect the first few rows
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Load the JSON data
        players_data = json.loads(processed_results)
        row_data = {'date': date}
        
        # Extract player information
        for i, player in enumerate(players_data):
            row_data[f'player{i+1}_name'] = player['player_name']
            row_data[f'player{i+1}_output'] = json.dumps(player)
        
        processed_rows.append(row_data)
    
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")
    except KeyError as e:
        print(f"Missing column: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 0: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 1: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 2: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 3: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 4: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 5: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 6: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 7: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 8: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 9: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 10: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 11: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for ro

In [2]:
# Correct the header if necessary
df.columns = ['date', 'original_text', 'processed_results']

# Inspect the first few rows
print("Corrected Columns in the DataFrame:", df.columns)


Corrected Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')


In [3]:
import pandas as pd
import json

# Load the CSV with the correct delimiter
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect the first few rows
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']
        
        # Check if the processed_results field is not empty
        if pd.notna(processed_results) and processed_results.strip():
            # Load the JSON data
            players_data = json.loads(processed_results)
            row_data = {'date': date}
            
            # Extract player information
            for i, player in enumerate(players_data):
                row_data[f'player{i+1}_name'] = player['player_name']
                row_data[f'player{i+1}_output'] = json.dumps(player)
            
            processed_rows.append(row_data)
        else:
            print(f"Empty processed_results at row {index}")
    
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")
    except KeyError as e:
        print(f"Missing column: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 0: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 1: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 2: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 3: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 4: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 5: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 6: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 7: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 8: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 9: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 10: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for row 11: Expecting value: line 1 column 1 (char 0)
Error decoding JSON for ro

In [4]:
# Inspect the first few rows of the processed_results column
print(df['processed_results'].head(10))


0    ```json\n[\n  {\n    "player_name": "Mohammad_...
1    ```json\n[\n  {\n    "player_name": "Shane_War...
2    ```json\n[\n  {\n    "player_name": "Brett_Lee...
3    ```json\n[\n  {\n    "player_name": "Adam_Gilc...
4    ```json\n[\n  {\n    "player_name": "Rohit_Sha...
5    ```json\n[\n  {\n    "player_name": "Kevin_Pie...
6    ```json\n[\n  {\n    "player_name": "Dimitri_M...
7    ```json\n[\n  {\n    "player_name": "Daniel_Ve...
8    ```json\n[\n  {\n    "player_name": "Dimitri_M...
9    ```json\n[\n  {\n    "player_name": "Nathan_Br...
Name: processed_results, dtype: object


In [5]:
import pandas as pd
import json

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Ensure the value is not empty
        if pd.notna(processed_results) and processed_results.strip():
            # Clean and fix the JSON format
            cleaned_json = (
                processed_results.strip()
                .replace("```json", "")
                .replace("```", "")
                .strip()
            )

            # Parse JSON
            players_data = json.loads(cleaned_json)
            row_data = {'date': date}

            # Extract player information
            for i, player in enumerate(players_data):
                row_data[f'player{i+1}_name'] = player['player_name']
                row_data[f'player{i+1}_output'] = json.dumps(player)

            processed_rows.append(row_data)
        else:
            print(f"Empty or invalid processed_results at row {index}")

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")
    except KeyError as e:
        print(f"Missing column: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 35: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Error decoding JSON for row 82: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Processed CSV saved at: /Users/hemantg/Desktop/try-processed_output.csv


In [6]:
import pandas as pd
import json

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Ensure the value is not empty
        if pd.notna(processed_results) and processed_results.strip():
            # Clean and fix the JSON format
            cleaned_json = (
                processed_results.strip()
                .replace("```json", "")
                .replace("```", "")
                .replace('""', '"')  # Fix incorrect quotes
                .strip()
            )

            # Parse JSON
            players_data = json.loads(cleaned_json)
            row_data = {'date': date}

            # Extract player information
            for i, player in enumerate(players_data):
                row_data[f'player{i+1}_name'] = player['player_name']
                row_data[f'player{i+1}_output'] = json.dumps(player)

            processed_rows.append(row_data)
        else:
            print(f"Empty or invalid processed_results at row {index}")

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")
    except KeyError as e:
        print(f"Missing column: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 35: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Error decoding JSON for row 82: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Processed CSV saved at: /Users/hemantg/Desktop/try-processed_output.csv


In [11]:
import pandas as pd
import json

# Load the CSV
input_file = '/Users/hemantg/Downloads/combined_output1.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Ensure the value is not empty
        if pd.notna(processed_results) and processed_results.strip():
            # Clean and fix the JSON format
            cleaned_json = (
                processed_results.strip()
                .replace("```json", "")
                .replace("```", "")
                .replace('""', '"')  # Fix incorrect quotes
                .strip()
            )

            # Parse JSON
            players_data = json.loads(cleaned_json)
            row_data = {'date': date}

            # Extract player information
            for i, player in enumerate(players_data):
                row_data[f'player{i+1}_name'] = player['player_name']
                row_data[f'player{i+1}_output'] = json.dumps(player)

            processed_rows.append(row_data)
        else:
            print(f"Empty or invalid processed_results at row {index}")

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")
    except KeyError as e:
        print(f"Missing column: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 35: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Error decoding JSON for row 82: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Error decoding JSON for row 112: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Error decoding JSON for row 122: Expecting value: line 36 column 16 (char 1690)
Error decoding JSON for row 129: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Error decoding JSON for row 130: Expecting property name enclosed in double quotes: line 86 column 102 (char 3906)
Error decoding JSON for row 133: Unterminated string starting at: line 83 column 20 (char 3934)
Error decoding JSON for row 189: Expecting value: line 85 column 16 (char 3813)
Error decoding JSON for row 190: Unterminated string starting at: line 84 column 5 (char 4052)


In [8]:
import pandas as pd
import json
import re

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Function to clean JSON
def clean_json_string(json_string):
    cleaned = (
        json_string.strip()
        .replace("```json", "")
        .replace("```", "")
        .replace("“", '"')  # Smart quotes to standard
        .replace("”", '"')
        .replace("‘", "'")
        .replace("’", "'")
        .replace("'", '"')  # Fix single quotes
    )
    # Remove any non-JSON text outside braces
    cleaned = re.sub(r'^[^{]*|[^}]*$', '', cleaned, flags=re.DOTALL)
    return cleaned

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Ensure the value is not empty
        if pd.notna(processed_results) and processed_results.strip():
            cleaned_json = clean_json_string(processed_results)

            # Parse JSON
            players_data = json.loads(cleaned_json)
            row_data = {'date': date}

            # Extract player information
            for i, player in enumerate(players_data):
                row_data[f'player{i+1}_name'] = player['player_name']
                row_data[f'player{i+1}_output'] = json.dumps(player)

            processed_rows.append(row_data)
        else:
            print(f"Empty or invalid processed_results at row {index}")

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")
    except KeyError as e:
        print(f"Missing column: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')


TypeError: string indices must be integers, not 'str'

In [9]:
import pandas as pd
import json
import re

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Function to clean JSON
def clean_json_string(json_string):
    cleaned = (
        json_string.strip()
        .replace("```json", "")
        .replace("```", "")
        .replace("“", '"')  # Smart quotes to standard
        .replace("”", '"')
        .replace("‘", "'")
        .replace("’", "'")
        .replace("'", '"')  # Fix single quotes
    )
    # Remove any non-JSON text outside braces
    cleaned = re.sub(r'^[^{\[]*|[^}\]]*$', '', cleaned, flags=re.DOTALL)
    return cleaned

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Ensure the value is not empty
        if pd.notna(processed_results) and processed_results.strip():
            cleaned_json = clean_json_string(processed_results)

            # Parse JSON
            players_data = json.loads(cleaned_json)
            
            # Ensure it's a list
            if isinstance(players_data, list):
                row_data = {'date': date}

                # Extract player information
                for i, player in enumerate(players_data):
                    row_data[f'player{i+1}_name'] = player['player_name']
                    row_data[f'player{i+1}_output'] = json.dumps(player)

                processed_rows.append(row_data)
            else:
                print(f"Unexpected data type at row {index}: {type(players_data)}")
        else:
            print(f"Empty or invalid processed_results at row {index}")

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {index}: {e}")
    except KeyError as e:
        print(f"Missing column: {e}")
    except TypeError as e:
        print(f"TypeError at row {index}: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 2: Expecting ',' delimiter: line 6 column 85 (char 346)
Error decoding JSON for row 12: Expecting ',' delimiter: line 5 column 137 (char 270)
Error decoding JSON for row 14: Expecting ',' delimiter: line 5 column 85 (char 256)
Error decoding JSON for row 16: Expecting ',' delimiter: line 4 column 53 (char 94)
Error decoding JSON for row 17: Expecting ',' delimiter: line 14 column 38 (char 825)
Error decoding JSON for row 30: Expecting ',' delimiter: line 22 column 59 (char 1177)
Error decoding JSON for row 32: Expecting ',' delimiter: line 21 column 151 (char 1407)
Error decoding JSON for row 33: Expecting ',' delimiter: line 6 column 49 (char 365)
Error decoding JSON for row 36: Expecting ',' delimiter: line 6 column 33 (char 286)
Error decoding JSON for row 37: Expecting ',' delimiter: line 5 column 88 (char 191)
Error decoding JSON for row 38: Expecting ',' del

In [12]:
import pandas as pd
import json
import re

# Load your DataFrame
df = pd.read_csv("/Users/hemantg/Downloads/combined_output1.csv")  # Replace with your actual file

# Function to clean JSON strings
def clean_json_string(json_str):
    try:
        # Attempt to decode the original JSON string
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        # Fix common issues in JSON string
        fixed_str = json_str

        # Fix missing double quotes around property names
        fixed_str = re.sub(r"(\{|,)(\s*)(\w+)(\s*):", r'\1 "\3":', fixed_str)

        # Fix unterminated strings by closing quotes
        fixed_str = re.sub(r'(".*?)(?<!\\)(")(?!:|,|})', r'\1\\"', fixed_str)

        # Remove extra commas
        fixed_str = re.sub(r",\s*([}\]])", r"\1", fixed_str)

        # Remove invalid control characters
        fixed_str = re.sub(r"[\x00-\x1F]+", "", fixed_str)

        # Attempt to load the fixed JSON string
        try:
            return json.loads(fixed_str)
        except json.JSONDecodeError:
            return None

# Apply the cleaning function
df["cleaned_results"] = df["processed_results"].apply(clean_json_string)

# Drop rows with invalid JSON if needed
df.dropna(subset=["cleaned_results"], inplace=True)

# Save the cleaned DataFrame
df.to_csv("/Users/hemantg/Downloads/cleaned_file.csv", index=False)

print("JSON cleaning completed!")


JSON cleaning completed!


In [13]:
def clean_json_string(json_string):
    cleaned = (
        json_string.strip()
        .replace("```json", "")
        .replace("```", "")
        .replace("“", '"')  # Smart quotes to standard
        .replace("”", '"')
        .replace("‘", "'")
        .replace("’", "'")
        .replace("'", '"')  # Fix single quotes
    )
    
    # Correct missing commas using regex
    cleaned = re.sub(r'(\}\s*)(\{)', r'\1,\2', cleaned)
    
    # Remove non-JSON text outside braces
    cleaned = re.sub(r'^[^{\[]*|[^}\]]*$', '', cleaned, flags=re.DOTALL)
    return cleaned


In [14]:
import json

# Safer JSON parsing
def safe_json_loads(json_string):
    try:
        return json.loads(json_string, strict=False)
    except json.JSONDecodeError:
        return None


In [15]:
import pandas as pd
import json
import re

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Function to clean JSON strings
def clean_json_string(json_string):
    cleaned = (
        json_string.strip()
        .replace("```json", "")
        .replace("```", "")
        .replace("“", '"')  # Smart quotes to standard
        .replace("”", '"')
        .replace("‘", "'")
        .replace("’", "'")
        .replace("'", '"')  # Fix single quotes
    )
    
    # Correct missing commas between JSON objects
    cleaned = re.sub(r'(\}\s*)(\{)', r'\1,\2', cleaned)
    
    # Remove non-JSON text outside braces
    cleaned = re.sub(r'^[^{\[]*|[^}\]]*$', '', cleaned, flags=re.DOTALL)
    return cleaned

# Safer JSON loading function
def safe_json_loads(json_string):
    try:
        return json.loads(json_string, strict=False)
    except json.JSONDecodeError:
        return None

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Ensure the value is not empty
        if pd.notna(processed_results) and processed_results.strip():
            cleaned_json = clean_json_string(processed_results)
            players_data = safe_json_loads(cleaned_json)

            if players_data is None:
                print(f"Error decoding JSON for row {index}: Invalid JSON structure")
                continue

            # Ensure it's a list
            if isinstance(players_data, list):
                row_data = {'date': date}

                # Extract player information
                for i, player in enumerate(players_data):
                    row_data[f'player{i+1}_name'] = player.get('player_name', 'N/A')
                    row_data[f'player{i+1}_output'] = json.dumps(player, ensure_ascii=False)

                processed_rows.append(row_data)
            else:
                print(f"Unexpected data type at row {index}: {type(players_data)}")
        else:
            print(f"Empty or invalid processed_results at row {index}")

    except KeyError as e:
        print(f"Missing column: {e}")
    except TypeError as e:
        print(f"TypeError at row {index}: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output2.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 2: Invalid JSON structure
Error decoding JSON for row 12: Invalid JSON structure
Error decoding JSON for row 14: Invalid JSON structure
Error decoding JSON for row 16: Invalid JSON structure
Error decoding JSON for row 17: Invalid JSON structure
Error decoding JSON for row 30: Invalid JSON structure
Error decoding JSON for row 32: Invalid JSON structure
Error decoding JSON for row 33: Invalid JSON structure
Error decoding JSON for row 36: Invalid JSON structure
Error decoding JSON for row 37: Invalid JSON structure
Error decoding JSON for row 38: Invalid JSON structure
Error decoding JSON for row 45: Invalid JSON structure
Error decoding JSON for row 47: Invalid JSON structure
Error decoding JSON for row 50: Invalid JSON structure
Error decoding JSON for row 52: Invalid JSON structure
Error decoding JSON for row 54: Invalid JSON structure
Error decoding JSON for r

In [16]:
import pandas as pd
import json
import re

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Function to clean JSON strings
def clean_json_string(json_string):
    cleaned = (
        json_string.strip()
        .replace("```json", "")
        .replace("```", "")
        .replace("“", '"')  # Smart quotes to standard
        .replace("”", '"')
        .replace("‘", "'")
        .replace("’", "'")
        .replace("'", '"')  # Fix single quotes
    )
    
    # Correct missing commas between JSON objects
    cleaned = re.sub(r'(\}\s*)(\{)', r'\1,\2', cleaned)
    
    # Remove non-JSON text outside braces
    cleaned = re.sub(r'^[^{\[]*|[^}\]]*$', '', cleaned, flags=re.DOTALL)
    return cleaned

# Safer JSON loading function
def safe_json_loads(json_string, row_index):
    try:
        return json.loads(json_string, strict=False)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {row_index}: {e}")
        print(f"Problematic content: {json_string[:500]}")  # Show first 500 chars
        return None

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Ensure the value is not empty
        if pd.notna(processed_results) and processed_results.strip():
            cleaned_json = clean_json_string(processed_results)
            players_data = safe_json_loads(cleaned_json, index)

            if players_data is None:
                continue

            # Ensure it's a list
            if isinstance(players_data, list):
                row_data = {'date': date}

                # Extract player information
                for i, player in enumerate(players_data):
                    row_data[f'player{i+1}_name'] = player.get('player_name', 'N/A')
                    row_data[f'player{i+1}_output'] = json.dumps(player, ensure_ascii=False)

                processed_rows.append(row_data)
            else:
                print(f"Unexpected data type at row {index}: {type(players_data)}")
        else:
            print(f"Empty or invalid processed_results at row {index}")

    except KeyError as e:
        print(f"Missing column: {e}")
    except TypeError as e:
        print(f"TypeError at row {index}: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output3.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 2: Expecting ',' delimiter: line 6 column 85 (char 346)
Problematic content: [
  {
    "player_name": "Brett_Lee",
    "Metrics": "No specific metrics provided; known for fast bowling and Twenty20 experience.",
    "Insights": "Former fast bowler, strong T20 presence, experienced in international cricket, adaptable to various formats.",
    "Arguments": "Positively framed; believes IPL could enhance Australian players" opportunities in T20 cricket.",
    "match_performance": 0.8,
    "predicted_future_performance": 0.7
  }
]
Error decoding JSON for row 12: Expecting ',' delimiter: line 5 column 137 (char 270)
Problematic content: [
    {
        "player_name": "Lalit_Modi",
        "Metrics": "N/A - IPL Commissioner, no specific performance metrics discussed.",
        "Insights": "Involved in negotiations; understands player concerns and restrictions. Not a playe

In [17]:
import pandas as pd
import json
import re

# Load the CSV
input_file = '/Users/hemantg/Desktop/try-combined-output.csv'  # Replace with the actual file path
df = pd.read_csv(input_file, sep=',')

# Inspect columns
print("Columns in the DataFrame:", df.columns)

# Initialize a list to store processed rows
processed_rows = []

# Enhanced JSON cleaning function
def clean_json_string(json_string):
    cleaned = (
        json_string.strip()
        .replace("```json", "")
        .replace("```", "")
        .replace("“", '"')
        .replace("”", '"')
        .replace("‘", "'")
        .replace("’", "'")
        .replace("'", '"')  # Fix single quotes
    )

    # Fix common JSON issues:
    cleaned = re.sub(r'([^\\])"(?![:,}\]])', r'\1\\"', cleaned)  # Escape unescaped quotes inside text
    cleaned = re.sub(r'(\}\s*)(\{)', r'\1,\2', cleaned)  # Add missing commas between objects
    cleaned = re.sub(r'^[^{\[]*|[^}\]]*$', '', cleaned, flags=re.DOTALL)  # Remove non-JSON text outside braces
    return cleaned

# Safer JSON loading function
def safe_json_loads(json_string, row_index):
    try:
        return json.loads(json_string, strict=False)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON for row {row_index}: {e}")
        print(f"Problematic content: {json_string[:500]}")  # Show first 500 chars
        return None

# Process each row
for index, row in df.iterrows():
    try:
        date = row['date']
        processed_results = row['processed_results']

        # Ensure the value is not empty
        if pd.notna(processed_results) and processed_results.strip():
            cleaned_json = clean_json_string(processed_results)
            players_data = safe_json_loads(cleaned_json, index)

            if players_data is None:
                continue

            # Ensure it's a list
            if isinstance(players_data, list):
                row_data = {'date': date}

                # Extract player information
                for i, player in enumerate(players_data):
                    row_data[f'player{i+1}_name'] = player.get('player_name', 'N/A')
                    row_data[f'player{i+1}_output'] = json.dumps(player, ensure_ascii=False)

                processed_rows.append(row_data)
            else:
                print(f"Unexpected data type at row {index}: {type(players_data)}")
        else:
            print(f"Empty or invalid processed_results at row {index}")

    except KeyError as e:
        print(f"Missing column: {e}")
    except TypeError as e:
        print(f"TypeError at row {index}: {e}")

# Create a new DataFrame
output_df = pd.DataFrame(processed_rows)

# Save to a new CSV file
output_file = '/Users/hemantg/Desktop/try-processed_output5.csv'
output_df.to_csv(output_file, index=False)

print(f"Processed CSV saved at: {output_file}")


Columns in the DataFrame: Index(['date', 'original_text', 'processed_results'], dtype='object')
Error decoding JSON for row 0: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Problematic content: [
  {
    \"player_name": \"Mohammad_Yousuf",
    \"Metrics": \"Pending IPL participation due to ICL contract dispute; prior good performance in T20 leagues.",
    \"Insights": \"Known for aggressive batting style; past record includes high averages; currently facing contract issues affecting form.",
    \"Arguments": \"Negative sentiment due to ongoing legal challenges; uncertainty impacting selling in IPL auction.",
    \"match_performance": 0,
    \"predicted_future_performance": 0.3
  }
]
Error decoding JSON for row 1: Expecting property name enclosed in double quotes: line 3 column 5 (char 10)
Problematic content: [
  {
    \"player_name": \"Shane_Warne",
    \"Metrics": \"Notable player with a strong leadership role; promotes personal sponsor effectively.",
 

In [19]:
import pandas as pd
import openai
from tqdm import tqdm

# Set OpenAI API Key
openai.api_key = "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"

# Function to generate embeddings for a batch of texts
def get_batch_embeddings(texts, engine="text-embedding-3-large"):
    response = openai.Embedding.create(input=texts, engine=engine)
    return [item['embedding'] for item in response['data']]

# Load the CSV file
input_file = "/Users/hemantg/Downloads/data_ready.csv"  # Update this path to your file
data = pd.read_csv(input_file)

# Columns to generate embeddings for
text_columns = ["Metrics", "Insights", "Arguments"]
batch_size = 50  # Adjust the batch size depending on your API rate limits

# Process each column
for col in text_columns:
    embeddings = []
    texts = data[col].fillna("").tolist()  # Replace NaNs with empty strings
    
    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Processing {col}"):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = get_batch_embeddings(batch_texts)
        embeddings.extend(batch_embeddings)
    
    # Add the embeddings to the dataframe
    data[f"{col}_embeddings"] = embeddings

# Save the updated CSV
output_file = "/Users/hemantg/Downloads/player_performance_with_embeddings.csv"
data.to_csv(output_file, index=False)

print(f"Updated CSV saved as {output_file}")


Processing Metrics:   7%|▋         | 42/579 [01:35<20:16,  2.27s/it]


InvalidRequestError: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.

In [24]:
import pandas as pd
import openai
from tqdm import tqdm

openai.api_key = "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"

# Function to generate embeddings for a batch of texts
def get_batch_embeddings(texts, engine="text-embedding-3-large"):
    """
    Generate embeddings for a batch of texts.
    """
    # Ensure texts are strings
    texts = [str(text) for text in texts]
    response = openai.Embedding.create(input=texts, engine=engine)
    return [item['embedding'] for item in response['data']]

# Load the CSV file
input_file = "/Users/hemantg/Downloads/data_ready.csv"  # Update this path to your file
data = pd.read_csv(input_file)

# Columns to generate embeddings for
text_columns = ["Metrics", "Insights", "Arguments"]
batch_size = 500  # Adjust the batch size based on the API limits

# Process each column
for col in text_columns:
    embeddings = []
    texts = data[col].fillna("").tolist()  # Replace NaNs with empty strings
    
    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size), desc=f"Processing {col}"):
        batch_texts = texts[i:i + batch_size]
        try:
            batch_embeddings = get_batch_embeddings(batch_texts)
            embeddings.extend(batch_embeddings)
        except Exception as e:
            print(f"Error processing batch {i}-{i + batch_size}: {e}")
            embeddings.extend([None] * len(batch_texts))  # Add placeholders for failed batches
    
    # Add the embeddings to the dataframe
    data[f"{col}_embeddings"] = embeddings

# Save the updated CSV
output_file = "/Users/hemantg/Downloads/player_performance_with_embeddings_final.csv"
data.to_csv(output_file, index=False)

print(f"Updated CSV saved as {output_file}")


Processing Metrics:   9%|▊         | 5/58 [00:16<02:30,  2.84s/it]

Error processing batch 2000-2500: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  10%|█         | 6/58 [00:16<01:46,  2.04s/it]

Error processing batch 2500-3000: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  19%|█▉        | 11/58 [00:34<02:08,  2.73s/it]

Error processing batch 5000-5500: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  29%|██▉       | 17/58 [00:55<02:06,  3.08s/it]

Error processing batch 8000-8500: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  38%|███▊      | 22/58 [01:14<01:53,  3.16s/it]

Error processing batch 10500-11000: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  53%|█████▎    | 31/58 [01:44<01:17,  2.88s/it]

Error processing batch 15000-15500: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  55%|█████▌    | 32/58 [01:45<00:56,  2.17s/it]

Error processing batch 15500-16000: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  84%|████████▍ | 49/58 [02:51<00:28,  3.15s/it]

Error processing batch 24000-24500: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  98%|█████████▊| 57/58 [03:20<00:02,  2.92s/it]

Error processing batch 28000-28500: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics: 100%|██████████| 58/58 [03:25<00:00,  3.54s/it]
Processing Insights:  17%|█▋        | 10/58 [00:42<02:48,  3.51s/it]

Error processing batch 4500-5000: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Insights: 100%|██████████| 58/58 [04:25<00:00,  4.57s/it]
Processing Arguments:  17%|█▋        | 10/58 [00:38<02:13,  2.79s/it]

Error processing batch 4500-5000: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Arguments: 100%|██████████| 58/58 [04:06<00:00,  4.24s/it]


Updated CSV saved as /Users/hemantg/Downloads/player_performance_with_embeddings_final.csv


In [25]:
import pandas as pd
import openai
from tqdm import tqdm


# Function to generate embeddings for a single text
def get_single_embedding(text, engine="text-embedding-3-large"):
    """
    Generate an embedding for a single text.
    """
    text = str(text)  # Ensure the text is a string
    response = openai.Embedding.create(input=[text], engine=engine)
    return response['data'][0]['embedding']

# Load the CSV file
input_file = "/Users/hemantg/Downloads/player_performance_with_embeddings_final.csv"  # Update this path to your file
data = pd.read_csv(input_file)

# Columns to generate embeddings for
text_columns = ["Metrics", "Insights", "Arguments"]

# Process each column
for col in text_columns:
    embedding_col = f"{col}_embeddings"

    # Check if the embedding column already exists
    if embedding_col not in data.columns:
        print(f"Embedding column {embedding_col} not found. Creating a new column with None values.")
        data[embedding_col] = None

    # Identify rows with missing embeddings
    rows_to_process = data[data[embedding_col].isna()]
    print(f"Processing {len(rows_to_process)} rows with missing {embedding_col}.")

    # Process rows sequentially
    for index, row in tqdm(rows_to_process.iterrows(), total=len(rows_to_process), desc=f"Processing {col}"):
        try:
            embedding = get_single_embedding(row[col])
            data.at[index, embedding_col] = embedding  # Update the embedding column
        except Exception as e:
            print(f"Error processing row {index}: {e}")
            continue

# Save the updated CSV
output_file = "/Users/hemantg/Downloads/player_performance_with_embeddings_final1.csv"
data.to_csv(output_file, index=False)

print(f"Updated CSV saved as {output_file}")


Processing 4500 rows with missing Metrics_embeddings.


Processing Metrics: 100%|██████████| 4500/4500 [59:34<00:00,  1.26it/s]  


Processing 500 rows with missing Insights_embeddings.


Processing Insights: 100%|██████████| 500/500 [06:25<00:00,  1.30it/s]


Processing 500 rows with missing Arguments_embeddings.


Processing Arguments: 100%|██████████| 500/500 [06:04<00:00,  1.37it/s]


Updated CSV saved as /Users/hemantg/Downloads/player_performance_with_embeddings_final1.csv


In [26]:
df1 = pd.read_csv("/Users/hemantg/Downloads/player_performance_with_embeddings_final1.csv")
df2 = pd.read_csv("/Users/hemantg/Downloads/player_performance_with_embeddings_final.csv")

Unnamed: 0,player_name,Metrics,Insights,Arguments,match_performance,predicted_future_performance,date,Metrics_embeddings,Insights_embeddings,Arguments_embeddings
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
28904,False,False,False,False,False,False,False,False,False,False
28905,False,False,False,False,False,False,False,False,False,False
28906,False,False,False,False,False,False,False,False,False,False
28907,False,False,False,False,False,False,False,False,False,False


0        [-0.0035224382299929857, -0.03489496186375618,...
1        [-0.023905431851744652, -0.006605220027267933,...
2        [0.006208884529769421, 0.022419022396206856, -...
3        [0.02593395859003067, -0.024754418060183525, -...
4        [0.0013744279276579618, 0.009274468757212162, ...
                               ...                        
28904    [-0.0012793116038665175, -0.02274535596370697,...
28905    [0.023514321073889732, -0.023244429379701614, ...
28906    [0.009010076522827148, 0.00954776257276535, -0...
28907    [0.009952873922884464, -0.023141490295529366, ...
28908    [-0.008739352226257324, -0.01818043924868107, ...
Name: Arguments_embeddings, Length: 28909, dtype: object

In [None]:
import pandas as pd
import openai
from tqdm import tqdm

openai.api_key = "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"

# Function to generate embedding for a single text
def get_single_embedding(text, engine="text-embedding-3-large"):
    """
    Generate embedding for a single text.
    """
    try:
        # Ensure text is a string
        text = str(text) if text is not None else ""
        response = openai.Embedding.create(input=[text], engine=engine)
        return response['data'][0]['embedding']
    except Exception as e:
        print(f"Error generating embedding for text: {text[:50]}... Error: {e}")
        return None

# Load the CSV file
input_file = "/Users/hemantg/Desktop/DL Project Sem5 -- Nighter/data_ready.csv"  # Update this path to your file
data = pd.read_csv(input_file)

# Columns to generate embeddings for
text_columns = ["Metrics", "Insights", "Arguments"]

# Process each column
for col in text_columns:
    # Generate embeddings row by row
    embeddings = []
    for text in tqdm(data[col].fillna(""), desc=f"Processing {col}"):
        embedding = get_single_embedding(text)
        embeddings.append(embedding)
    
    # Add the embeddings to the dataframe
    data[f"{col}_embeddings"] = embeddings

# Save the updated CSV
output_file = "'/Users/hemantg/Desktop/DL Project Sem5 -- Nighter/player_performance_with_embeddings_final-squared.csv"
data.to_csv(output_file, index=False)

print(f"Updated CSV saved as {output_file}")

Processing Metrics:   7%|▋         | 2136/28909 [29:27<4:41:14,  1.59it/s] 

Error generating embedding for text: ... Error: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  10%|█         | 2966/28909 [42:04<5:11:27,  1.39it/s] 

Error generating embedding for text: ... Error: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  10%|█         | 2968/28909 [42:05<4:41:30,  1.54it/s]

Error generating embedding for text: ... Error: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  10%|█         | 2969/28909 [42:06<4:04:52,  1.77it/s]

Error generating embedding for text: ... Error: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  18%|█▊        | 5259/28909 [1:16:59<5:06:03,  1.29it/s] 

Error generating embedding for text: ... Error: '$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


Processing Metrics:  21%|██        | 6018/28909 [1:30:13<5:21:38,  1.19it/s] 