In [2]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = '/content/50_Famous_Speechs.csv'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / int(total_length))
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            if filename.endswith('.zip'):
              with ZipFile(tfile) as zfile:
                zfile.extractall(destination_path)
            else:
              with tarfile.open(tfile.name) as tarfile:
                tarfile.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


ValueError: not enough values to unpack (expected 2, got 1)

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Introduction
***This notebook contains an analysis of the "Voices of History: 50 Iconic Speeches" dataset. We aim to perform sentiment analysis on the speeches using the VADER sentiment analysis tool and train a logistic regression model to classify the sentiments.***

# Step 1: Data Preparation
First, ensure you have the necessary libraries installed:

In [4]:
pip install pandas scikit-learn nltk tensorflow



**Then, load the dataset and prepare it for analysis:**

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Try different encodings if the first one doesn't work
encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

for encoding in encodings:
    try:
        # Load the dataset with the specified encoding
        df = pd.read_csv('/content/50_Famous_Speechs.csv', encoding=encoding)

        # Display the first few rows
        print(df.head())
        break
    except UnicodeDecodeError as e:
        print(f"Failed to read with encoding {encoding}: {e}")


Failed to read with encoding utf-8: 'utf-8' codec can't decode byte 0x93 in position 0: invalid start byte
                                 Title of the Speech  \
0                           1. I have a dream by MLK   
1             2. Tilbury Speech by Queen Elizabeth I   
2  3. Woodrow Wilson, address to Congress (April ...   
3            4. Ainât I A Woman by Sojourner Truth   
4       5. The Gettsyburg Address by Abraham Lincoln   

                                              Speech  
0  I have a dream that one day down in Alabama, ...  
1  My loving people,\n\nWe have been persuaded b...  
2  The world must be made safe for democracy. It...  
3  That man over there says that women need to b...  
4  Fondly do we hope, fervently do we pray, that...  


# Step 2: Sentiment Analysis
*For sentiment analysis, you can use pre-labeled data to train a model. Since your dataset might not be labeled for sentiment, you can use an existing sentiment analysis model like VADER (from NLTK) for labeling.*

**Labeling with VADER:**

In [7]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Function to get sentiment
def get_sentiment(text):
    scores = sia.polarity_scores(text)
    return 'positive' if scores['compound'] > 0 else 'negative'

df['Sentiment'] = df['Speech'].apply(get_sentiment)

# Display the labeled data
print(df.head())


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


                                 Title of the Speech  \
0                           1. I have a dream by MLK   
1             2. Tilbury Speech by Queen Elizabeth I   
2  3. Woodrow Wilson, address to Congress (April ...   
3            4. Ainât I A Woman by Sojourner Truth   
4       5. The Gettsyburg Address by Abraham Lincoln   

                                              Speech Sentiment  
0  I have a dream that one day down in Alabama, ...  positive  
1  My loving people,\n\nWe have been persuaded b...  positive  
2  The world must be made safe for democracy. It...  positive  
3  That man over there says that women need to b...  positive  
4  Fondly do we hope, fervently do we pray, that...  positive  


**Train/Test Split:**

In [8]:
X = df['Speech']
y = df['Sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


**Feature Extraction:**

In [9]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


**Model Training:
Using Logistic Regression for simplicity:**

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_vec)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


0.5
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         4
    positive       0.50      1.00      0.67         4

    accuracy                           0.50         8
   macro avg       0.25      0.50      0.33         8
weighted avg       0.25      0.50      0.33         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Step 3: Topic Modeling
**For topic modeling, Latent Dirichlet Allocation (LDA) is commonly used:**

In [11]:
from sklearn.decomposition import LatentDirichletAllocation

# Vectorize the speeches
vectorizer = CountVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(df['Speech'])

# Train LDA model
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X_vec)

# Display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)


Topic 0:
space next hundred years questions planet undoubted sacrificing position big
Topic 1:
war russia mankind history towards wants millions also nor tsarism
Topic 2:
the of and to that in we is for be
Topic 3:
the of and in to it not we is that
Topic 4:
black use position sanction white defending re move themselves now


# Step 4: Emotion Analysis
**For emotion analysis, you might want to use a pre-trained model such as the one available in the transformers library by Hugging Face:**

In [12]:
pip install transformers



**Using a pre-trained model for emotion classification:**

In [15]:
from google.colab import userdata
userdata.get('HF_TOKEN')

'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'

In [16]:
from transformers import pipeline

# Initialize emotion classification pipeline
emotion_classifier = pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')

# Function to classify emotions in text chunks
def classify_emotions(text):
    # Split the text into smaller chunks
    max_chunk_length = 512  # Maximum sequence length supported by the model
    chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]

    # Classify emotions in each chunk
    emotions = []
    for chunk in chunks:
        emotions.extend(emotion_classifier(chunk))

    return emotions

# Classify emotions in the speeches
df['Emotion'] = df['Speech'].apply(classify_emotions)

# Display the results
print(df.head())


config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

                                 Title of the Speech  \
0                           1. I have a dream by MLK   
1             2. Tilbury Speech by Queen Elizabeth I   
2  3. Woodrow Wilson, address to Congress (April ...   
3            4. Ainât I A Woman by Sojourner Truth   
4       5. The Gettsyburg Address by Abraham Lincoln   

                                              Speech Sentiment  \
0  I have a dream that one day down in Alabama, ...  positive   
1  My loving people,\n\nWe have been persuaded b...  positive   
2  The world must be made safe for democracy. It...  positive   
3  That man over there says that women need to b...  positive   
4  Fondly do we hope, fervently do we pray, that...  positive   

                                             Emotion  
0  [{'label': 'fear', 'score': 0.4142738878726959...  
1  [{'label': 'fear', 'score': 0.9686101078987122...  
2  [{'label': 'anger', 'score': 0.435908675193786...  
3  [{'label': 'anger', 'score': 0.589276134967

In [20]:
with open('emotion_analysis2.csv', 'w') as f:
    f.write('Speech,Emotion\n')
    for _, row in df.iterrows():
        for emotion in row['Emotion']:
            f.write(f'{row["Speech"]},{emotion["label"]}\n')

    print("Done")

Done


In [21]:
from transformers import pipeline
import pandas as pd

# Initialize emotion classification pipeline
emotion_classifier = pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')

# Function to classify emotions in text chunks
def classify_emotions(text):
    # Split the text into smaller chunks
    max_chunk_length = 512  # Maximum sequence length supported by the model
    chunks = [text[i:i + max_chunk_length] for i in range(0, len(text), max_chunk_length)]

    # Classify emotions in each chunk
    emotions = []
    for chunk in chunks:
        emotions.extend(emotion_classifier(chunk))

    return emotions

# Assuming 'df' is your dataframe and 'Speech' is the column with text
# Classify emotions in the speeches
df['Emotion'] = df['Speech'].apply(classify_emotions)

# Save the dataframe to a CSV file
df.to_csv('emotion_output.csv', index=False)

# Display the results
print(df.head())


                                 Title of the Speech  \
0                           1. I have a dream by MLK   
1             2. Tilbury Speech by Queen Elizabeth I   
2  3. Woodrow Wilson, address to Congress (April ...   
3            4. Ainât I A Woman by Sojourner Truth   
4       5. The Gettsyburg Address by Abraham Lincoln   

                                              Speech Sentiment  \
0  I have a dream that one day down in Alabama, ...  positive   
1  My loving people,\n\nWe have been persuaded b...  positive   
2  The world must be made safe for democracy. It...  positive   
3  That man over there says that women need to b...  positive   
4  Fondly do we hope, fervently do we pray, that...  positive   

                                             Emotion  
0  [{'label': 'fear', 'score': 0.4142738878726959...  
1  [{'label': 'fear', 'score': 0.9686101078987122...  
2  [{'label': 'anger', 'score': 0.435908675193786...  
3  [{'label': 'anger', 'score': 0.589276134967

In [None]:
print("hello")

hello
