In [2]:
# Step 1: Load and Preprocess the CSV File

import pandas as pd

# Load the CSV file
file_path = '/Users/hemantg/Desktop/dl-project-scraped-final.csv'  # Update this path
df = pd.read_csv(file_path)

# Display the first few rows
print("Original Data:")
print(df.head())

# Preprocessing
# Remove extra whitespace and newlines in text columns
df['title'] = df['title'].str.strip().replace(r'\s+', ' ', regex=True)
df['content'] = df['content'].str.strip().replace(r'\s+', ' ', regex=True)

# Standardize the date format
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)

# Drop rows with invalid dates
df = df.dropna(subset=['date'])

# Sort the DataFrame by date
df = df.sort_values(by='date').reset_index(drop=True)

# Display preprocessed data
print("\nPreprocessed Data:")
print(df.head())



Original Data:
                                                 url  \
0  https://www.espncricinfo.com/story/bcci-launch...   
1  https://www.espncricinfo.com/story/warne-joins...   
2  https://www.espncricinfo.com/story/mcgrath-hop...   
3  https://www.espncricinfo.com/story/will-twenty...   
4  https://www.espncricinfo.com/story/jayawardene...   

                                               title       date  \
0             International Twenty20 league launched  13-Sep-07   
1  Warne joins player pool for Indian Twenty20 le...  16-Sep-07   
2       McGrath hopes Twenty20 stays as third format  19-Sep-07   
3                       Will Twenty20 wreck cricket?  23-Sep-07   
4    Jayawardene among eight Sri Lankans to join IPL   1-Oct-07   

                                             content Unnamed: 4 Unnamed: 5  \
0  Stephen Fleming and Glenn McGrath at the launc...        NaN        NaN   
1  Shane Warne and Glenn McGrath could soon be pl...        NaN        NaN   
2  Glenn Mc

  df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)


In [3]:
# Step 1: Load and Preprocess the CSV File

import pandas as pd

# Load the CSV file
file_path = '/Users/hemantg/Desktop/dl-project-scraped-final.csv'  # Update this path
df = pd.read_csv(file_path)

# Remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Display the first few rows
print("Original Data (Cleaned):")
print(df.head())

# Preprocessing
# Remove extra whitespace and newlines in text columns
df['title'] = df['title'].str.strip().replace(r'\s+', ' ', regex=True)
df['content'] = df['content'].str.strip().replace(r'\s+', ' ', regex=True)

# Standardize the date format
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)

# Drop rows with invalid dates
df = df.dropna(subset=['date'])

# Sort the DataFrame by date
df = df.sort_values(by='date').reset_index(drop=True)

# Display preprocessed data
print("\nPreprocessed Data:")
print(df.head())



Original Data (Cleaned):
                                                 url  \
0  https://www.espncricinfo.com/story/bcci-launch...   
1  https://www.espncricinfo.com/story/warne-joins...   
2  https://www.espncricinfo.com/story/mcgrath-hop...   
3  https://www.espncricinfo.com/story/will-twenty...   
4  https://www.espncricinfo.com/story/jayawardene...   

                                               title       date  \
0             International Twenty20 league launched  13-Sep-07   
1  Warne joins player pool for Indian Twenty20 le...  16-Sep-07   
2       McGrath hopes Twenty20 stays as third format  19-Sep-07   
3                       Will Twenty20 wreck cricket?  23-Sep-07   
4    Jayawardene among eight Sri Lankans to join IPL   1-Oct-07   

                                             content  
0  Stephen Fleming and Glenn McGrath at the launc...  
1  Shane Warne and Glenn McGrath could soon be pl...  
2  Glenn McGrath has not played cricket since the...  
3  The fans hav

  df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)


In [4]:
# Step 1: Load and Preprocess the CSV File

import pandas as pd

# Load the CSV file
file_path = '/Users/hemantg/Desktop/dl-project-scraped-final.csv'  # Update this path
df = pd.read_csv(file_path)

# Remove unnamed columns
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

# Display the first few rows
print("Original Data (Cleaned):")
print(df.head())

# Preprocessing
# Remove extra whitespace and newlines in text columns
df['title'] = df['title'].str.strip().replace(r'\s+', ' ', regex=True)
df['content'] = df['content'].str.strip().replace(r'\s+', ' ', regex=True)

# Standardize the date format explicitly
df['date'] = pd.to_datetime(df['date'], format='%d-%b-%y', errors='coerce')

# Drop rows with invalid dates
df = df.dropna(subset=['date'])

# Sort the DataFrame by date
df = df.sort_values(by='date').reset_index(drop=True)

# Display preprocessed data
print("\nPreprocessed Data:")
print(df.head())



Original Data (Cleaned):
                                                 url  \
0  https://www.espncricinfo.com/story/bcci-launch...   
1  https://www.espncricinfo.com/story/warne-joins...   
2  https://www.espncricinfo.com/story/mcgrath-hop...   
3  https://www.espncricinfo.com/story/will-twenty...   
4  https://www.espncricinfo.com/story/jayawardene...   

                                               title       date  \
0             International Twenty20 league launched  13-Sep-07   
1  Warne joins player pool for Indian Twenty20 le...  16-Sep-07   
2       McGrath hopes Twenty20 stays as third format  19-Sep-07   
3                       Will Twenty20 wreck cricket?  23-Sep-07   
4    Jayawardene among eight Sri Lankans to join IPL   1-Oct-07   

                                             content  
0  Stephen Fleming and Glenn McGrath at the launc...  
1  Shane Warne and Glenn McGrath could soon be pl...  
2  Glenn McGrath has not played cricket since the...  
3  The fans hav

In [5]:
# Step 2: Sentiment Analysis Using Grok API
import requests
import json
import pandas as pd
import time

# Define the Grok API Key and Endpoint
API_KEY = "xai-s8uBO0ByYKXw3QQ48PRQnieUBYCAPQQFdICsVeyTWYfNRqkGyFbrqIpOMnAXQx7N3D2vm3kRMsTdHe0c"
API_URL = "https://api.x.ai/v1/chat/completions"

# Function to Analyze Sentiment Using Grok API with Retry Logic
def analyze_sentiment_grok(text, index):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "grok-beta",
        "messages": [
            {"role": "system", "content": "You are a sentiment analysis assistant."},
            {"role": "user", "content": f"Analyze the sentiment of the following text: {text}"}
        ],
        "temperature": 0
    }

    retries = 3
    for attempt in range(retries):
        response = requests.post(API_URL, headers=headers, json=payload)
        
        if response.status_code == 200:
            result = response.json()
            completion_text = result["choices"][0]["message"]["content"]

            # Extract sentiment information
            label = "Neutral"
            score = 0.0

            if "Positive" in completion_text:
                label = "Positive"
                score = 0.9
            elif "Negative" in completion_text:
                label = "Negative"
                score = 0.1

            # Print progress every 100 articles
            if index % 100 == 0:
                print(f"Processed {index} articles...")

            return label, score, completion_text

        elif response.status_code in [429, 500]:  # Retry on rate-limit or server error
            print(f"Rate limit or server error at index {index}. Retrying in 30 seconds...")
            time.sleep(30)
        else:
            print(f"API Error at index {index}: {response.status_code}, {response.text}")
            return "Neutral", 0.0, "Error in response"

    # If all retries fail
    return "Neutral", 0.0, "Max retries exceeded"


# **Save Progress Logic**
batch_size = 500

for i in range(0, len(df), batch_size):
    batch_df = df.iloc[i:i+batch_size].copy()
    
    # Apply Sentiment Analysis
    batch_df[['sentiment_label', 'sentiment_score', 'raw_response']] = batch_df['content'].apply(
        lambda x: pd.Series(analyze_sentiment_grok(x, i))
    )

    # Save Progress After Each Batch
    batch_df.to_csv(f'/kaggle/working/sentiment_analysis_batch_{i}.csv', index=False)
    print(f"Saved batch {i} to CSV.")

print("\nSentiment Analysis Completed Successfully!")


Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...
Processed 0 articles...


KeyboardInterrupt: 

In [6]:
# Step 2: Sentiment Analysis Using Grok API
import requests
import json
import pandas as pd
import time

# Define the Grok API Key and Endpoint
API_KEY = "xai-s8uBO0ByYKXw3QQ48PRQnieUBYCAPQQFdICsVeyTWYfNRqkGyFbrqIpOMnAXQx7N3D2vm3kRMsTdHe0c"
API_URL = "https://api.x.ai/v1/chat/completions"

# Function to Analyze Sentiment Using Grok API
def analyze_sentiment_grok(text, row_index):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "grok-beta",
        "messages": [
            {"role": "system", "content": "You are a sentiment analysis assistant."},
            {"role": "user", "content": f"Analyze the sentiment of the following text: {text}"}
        ],
        "temperature": 0
    }

    retries = 3
    for attempt in range(retries):
        response = requests.post(API_URL, headers=headers, json=payload)

        if response.status_code == 200:
            result = response.json()
            completion_text = result["choices"][0]["message"]["content"]

            # Extract sentiment information
            label = "Neutral"
            score = 0.0

            if "Positive" in completion_text:
                label = "Positive"
                score = 0.9
            elif "Negative" in completion_text:
                label = "Negative"
                score = 0.1

            # Print progress for each article
            print(f"Processed article {row_index}: Sentiment={label}, Score={score}")

            return label, score, completion_text

        elif response.status_code in [429, 500]:  # Retry on rate-limit or server error
            print(f"Rate limit or server error at index {row_index}. Retrying in 30 seconds...")
            time.sleep(30)
        else:
            print(f"API Error at index {row_index}: {response.status_code}, {response.text}")
            return "Neutral", 0.0, "Error in response"

    # If all retries fail
    return "Neutral", 0.0, "Max retries exceeded"


# **Save Progress Logic**
batch_size = 500

for i in range(0, len(df), batch_size):
    batch_df = df.iloc[i:i+batch_size].copy()
    
    # Apply Sentiment Analysis with Correct Index and Real-Time Output
    batch_df[['sentiment_label', 'sentiment_score', 'raw_response']] = batch_df.apply(
        lambda row: pd.Series(analyze_sentiment_grok(row['content'], row.name)), axis=1
    )

    # Save Progress After Each Batch
    batch_df.to_csv(f'/Users/hemantg/Desktop/sentiment_analysis_batch_{i}.csv', index=False)
    print(f"Saved batch {i} to CSV.")

print("\nSentiment Analysis Completed Successfully!")


Processed article 0: Sentiment=Positive, Score=0.9
Processed article 1: Sentiment=Positive, Score=0.9
Processed article 2: Sentiment=Positive, Score=0.9
Processed article 3: Sentiment=Positive, Score=0.9
Processed article 4: Sentiment=Neutral, Score=0.0
Processed article 5: Sentiment=Positive, Score=0.9
Processed article 6: Sentiment=Positive, Score=0.9
Processed article 7: Sentiment=Positive, Score=0.9
Processed article 8: Sentiment=Negative, Score=0.1
Processed article 9: Sentiment=Neutral, Score=0.0
Processed article 10: Sentiment=Positive, Score=0.9
Processed article 11: Sentiment=Positive, Score=0.9
Processed article 12: Sentiment=Positive, Score=0.9
Processed article 13: Sentiment=Positive, Score=0.9
Processed article 14: Sentiment=Positive, Score=0.9
Processed article 15: Sentiment=Positive, Score=0.9
Processed article 16: Sentiment=Neutral, Score=0.0
Processed article 17: Sentiment=Positive, Score=0.9
Processed article 18: Sentiment=Positive, Score=0.9
Processed article 19: Sen

KeyboardInterrupt: 

In [7]:
# Function to Extract Contextual and Context-Discovered Features Using Grok
def extract_dynamic_context_grok(text, row_index):
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }
    
    # New Prompt for Feature Discovery
    prompt = f"""
    You are a cricket performance analyst. Extract features from this cricket article related to predicting a player's future performance.

    1. Extract the following features:
       - Player Mentioned
       - Runs Scored (if any)
       - Wickets Taken (if any)
       - Match Context (location, opponent)
       - Mention of Injuries or Selection Updates
       - Predicted Player Confidence Level (0 to 1)
       - Reason for Praise or Criticism (short text)

    2. Additionally, look for **context-specific features** related to predicting the player's future performance:
       - Emerging performance indicators (e.g., tactical mentions, team dynamics).
       - Strategy hints (e.g., promotion in batting order, bowling changes).
       - Important insights **not listed above**.

    3. Provide a **short justification** explaining why these features were selected.

    Article: {text}
    """

    payload = {
        "model": "grok-beta",
        "messages": [
            {"role": "system", "content": "You are a cricket performance analyst."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0
    }

    retries = 3
    for attempt in range(retries):
        response = requests.post(API_URL, headers=headers, json=payload)

        if response.status_code == 200:
            result = response.json()
            completion_text = result["choices"][0]["message"]["content"]
            print(f"Processed article {row_index}: Extracted Contextual Features")
            return completion_text

        elif response.status_code in [429, 500]:  # Retry on rate-limit or server error
            print(f"Rate limit or server error at index {row_index}. Retrying in 30 seconds...")
            time.sleep(30)
        else:
            print(f"API Error at index {row_index}: {response.status_code}, {response.text}")
            return "Error in response"

    return "Max retries exceeded"

# Batch Processing with Date Sorting
batch_size = 500

for i in range(0, len(df), batch_size):
    batch_df = df.iloc[i:i+batch_size].copy()

    # Extract Contextual Features from Each Article
    batch_df['dynamic_features'] = batch_df.apply(
        lambda row: extract_dynamic_context_grok(row['content'], row.name), axis=1
    )

    # Ensure Sorting by Date
    batch_df = batch_df.sort_values(by='date', ascending=True)

    # Save Progress After Each Batch
    batch_df.to_csv(f'/Users/hemantg/Desktop/sentiment_analysis_batch_{i}.csv', index=False)
    print(f"Saved batch {i} with dynamic context features to CSV (sorted by date).")

print("\nDynamic Contextual Feature Extraction and Date Sorting Completed Successfully!")


Processed article 0: Extracted Contextual Features
Processed article 1: Extracted Contextual Features
Processed article 2: Extracted Contextual Features
Processed article 3: Extracted Contextual Features
Processed article 4: Extracted Contextual Features
Processed article 5: Extracted Contextual Features
Processed article 6: Extracted Contextual Features
Processed article 7: Extracted Contextual Features
Processed article 8: Extracted Contextual Features
Processed article 9: Extracted Contextual Features
Processed article 10: Extracted Contextual Features
Processed article 11: Extracted Contextual Features
Processed article 12: Extracted Contextual Features
Processed article 13: Extracted Contextual Features
Processed article 14: Extracted Contextual Features
Processed article 15: Extracted Contextual Features
Processed article 16: Extracted Contextual Features
Processed article 17: Extracted Contextual Features
Processed article 18: Extracted Contextual Features
Processed article 19: 

Processed article 157: Extracted Contextual Features
Processed article 158: Extracted Contextual Features
Processed article 159: Extracted Contextual Features
Processed article 160: Extracted Contextual Features
Processed article 161: Extracted Contextual Features
Processed article 162: Extracted Contextual Features
Processed article 163: Extracted Contextual Features
Processed article 164: Extracted Contextual Features
Processed article 165: Extracted Contextual Features
Processed article 166: Extracted Contextual Features
Processed article 167: Extracted Contextual Features
Processed article 168: Extracted Contextual Features
Processed article 169: Extracted Contextual Features
Processed article 170: Extracted Contextual Features
Processed article 171: Extracted Contextual Features
Processed article 172: Extracted Contextual Features
Processed article 173: Extracted Contextual Features
Processed article 174: Extracted Contextual Features
Processed article 175: Extracted Contextual Fe

Processed article 312: Extracted Contextual Features
Processed article 313: Extracted Contextual Features
Processed article 314: Extracted Contextual Features
Processed article 315: Extracted Contextual Features
Processed article 316: Extracted Contextual Features
Processed article 317: Extracted Contextual Features
Processed article 318: Extracted Contextual Features
Processed article 319: Extracted Contextual Features
Processed article 320: Extracted Contextual Features
Processed article 321: Extracted Contextual Features
Processed article 322: Extracted Contextual Features
Processed article 323: Extracted Contextual Features
Processed article 324: Extracted Contextual Features
Processed article 325: Extracted Contextual Features
Processed article 326: Extracted Contextual Features
Processed article 327: Extracted Contextual Features
Processed article 328: Extracted Contextual Features
Processed article 329: Extracted Contextual Features
Processed article 330: Extracted Contextual Fe

Processed article 467: Extracted Contextual Features
Processed article 468: Extracted Contextual Features
Processed article 469: Extracted Contextual Features
Processed article 470: Extracted Contextual Features
Processed article 471: Extracted Contextual Features
Processed article 472: Extracted Contextual Features
Processed article 473: Extracted Contextual Features
Processed article 474: Extracted Contextual Features
Processed article 475: Extracted Contextual Features
Processed article 476: Extracted Contextual Features
Processed article 477: Extracted Contextual Features
Processed article 478: Extracted Contextual Features
Processed article 479: Extracted Contextual Features
Processed article 480: Extracted Contextual Features
Processed article 481: Extracted Contextual Features
Processed article 482: Extracted Contextual Features
Processed article 483: Extracted Contextual Features
Processed article 484: Extracted Contextual Features
Processed article 485: Extracted Contextual Fe

ConnectionError: HTTPSConnectionPool(host='api.x.ai', port=443): Max retries exceeded with url: /v1/chat/completions (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x152c50110>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))