![](https://th.bing.com/th/id/OIP.Gl6di-r90XKZrxga8g8wKQHaFq?pid=ImgDet&w=1022&h=782&rs=1)

# Import and check data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Read the semicolon-delimited laureates CSV into `df` and preview the
# first rows.
DATA_PATH = '/kaggle/input/nobel-prize/nobel-prize-laureates.csv'
df = pd.read_csv(DATA_PATH, sep=';')
df.head()


In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
import pandas as pd
import plotly.express as px

# Assumes the DataFrame has already been loaded as `df`.

# 'Geo Point 2D' is split on ', ' into two parts, stored as latitude and
# longitude (in that order).
coordinate_columns = ['latitude', 'longitude']
df[coordinate_columns] = df['Geo Point 2D'].str.split(', ', expand=True)

# The split produces strings; convert both columns to numbers.
df[coordinate_columns] = df[coordinate_columns].apply(pd.to_numeric)

# Plot every coordinate pair on a Plotly geographic scatter map.
fig = px.scatter_geo(df, lat='latitude', lon='longitude')
fig.show()


# Distribution of Gender

In [None]:
# Bar chart of how often each value occurs in the 'Gender' column.
gender_counts = df['Gender'].value_counts()

plt.figure(figsize=(8, 6))
gender_counts.plot.bar()
plt.title('Distribution of Gender')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()

# Top 10 Born Countries

In [None]:
# Bar chart of the ten most frequent values in 'Born country'.
# value_counts() sorts by descending frequency, so the first ten rows are
# the ten largest counts.
birth_country_counts = df['Born country'].value_counts()
top_ten_countries = birth_country_counts.iloc[:10]

plt.figure(figsize=(12, 6))
top_ten_countries.plot.bar()
plt.title('Top 10 Born Countries')
plt.xlabel('Born Country')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')  # slant labels so long names fit
plt.show()


# Gender Distribution by Category

In [None]:
# Stacked bar chart: one bar per 'Category', segmented by 'Gender'.
# Rows are counted per (Category, Gender) pair and the Gender level is
# pivoted into columns.
gender_category_counts = df.groupby(['Category', 'Gender']).size().unstack()

# DataFrame.plot opens its own figure unless it is handed an Axes, so the
# size is passed here directly. Calling plt.figure(figsize=...) beforehand
# would leave an empty extra figure and the size would not apply to the
# chart.
gender_category_counts.plot(kind='bar', stacked=True, figsize=(10, 6))
plt.xlabel('Category')
plt.ylabel('Count')
plt.title('Gender Distribution by Category')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Gender')
plt.show()

# Distribution of Birth Years

In [None]:
# Histogram of birth years.
# The year is taken from the leading 'YYYY' part of 'Born' (the same
# parsing the age-at-death cell uses) instead of the 'Year' column, which
# presumably holds the award year rather than the birth year -- confirm
# against the dataset schema.
# Unparseable or missing entries become NaN and are left out of the plot.
birth_years = pd.to_numeric(df['Born'].str.split('-').str[0], errors='coerce')

plt.figure(figsize=(10, 6))
birth_years.plot(kind='hist', bins=20)
plt.xlabel('Year')
plt.ylabel('Count')
plt.title('Distribution of Birth Years')
plt.show()

# Category Distribution

In [None]:
# Bar chart of how many rows fall into each 'Category' value.
category_counts = df['Category'].value_counts()

plt.figure(figsize=(10, 6))
category_counts.plot.bar()
plt.title('Category Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.show()


# Density Plot of Age at Death

In [None]:
# Age at death, approximated as (year of death - year of birth).
# Only the text before the first '-' of each date string is used, so the
# result ignores month and day and can be off by up to one year.
# Rows with a missing 'Died' or 'Born' value yield NaN.
death_year = df['Died'].str.split('-').str[0].astype(float)
birth_year = df['Born'].str.split('-').str[0].astype(float)
df['Age_at_Death'] = death_year - birth_year

plt.figure(figsize=(10, 6))
# stat='density' normalises the bars (and the KDE curve) so the y-axis
# really is a density, matching the axis label and title; the default
# stat would show raw counts.
sns.histplot(data=df, x='Age_at_Death', kde=True, stat='density')
plt.xlabel('Age at Death')
plt.ylabel('Density')
plt.title('Density Plot of Age at Death')
plt.show()


# Motivation NLP analysis

In [None]:
# Display the 'Motivation' column that the text analysis below works on.
df.loc[:, 'Motivation']

# Most common word counts

In [None]:
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string


# Print the 25 most frequent non-stop-words across all motivation texts.
motivation_column = df['Motivation']

# Merge every motivation into one string. Missing values are dropped
# first because str.join raises TypeError on non-string items such as NaN.
# `motivation_column` itself is left untouched so it keeps one entry per
# row of `df`.
combined_text = ' '.join(motivation_column.dropna().astype(str))

# Tokenise, keep only purely alphanumeric tokens (this drops punctuation),
# and lower-case them so counting is case-insensitive.
words = [
    token.lower()
    for token in word_tokenize(combined_text)
    if token.isalnum()
]

# Remove NLTK's English stop words so the ranking is not dominated by
# function words.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in words if word not in stop_words]

word_counts = Counter(filtered_words)

most_common_words = word_counts.most_common(25)

for word, count in most_common_words:
    print(f'{word}: {count}')


# Word cloud

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build a word cloud from the concatenated motivation text
# (`combined_text`) and show it as an image without axes.
cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(combined_text)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # hide the axes around the image
plt.show()



# Clustering 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Cluster the motivation texts with TF-IDF + k-means and show the clusters
# in a 3-D PCA projection. Expects `df` and `motivation_column`
# (df['Motivation']) to be defined already.

# TF-IDF features, one row per motivation. Missing motivations are
# replaced by empty documents rather than dropped, so the label array
# keeps one entry per row of `df` and can be assigned back as a column.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(motivation_column.fillna(''))

num_clusters = 4
# A fixed seed makes the cluster assignment repeatable between runs, and
# an explicit n_init keeps the number of restarts independent of the
# installed scikit-learn version's default.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(X)

labels = kmeans.labels_
df['Cluster'] = labels

# Dimensionality reduction with PCA (3 principal components).
# PCA is given a dense array here, hence toarray(); the seed keeps the
# projection repeatable if a randomised solver is selected.
pca = PCA(n_components=3, random_state=42)
X_pca = pca.fit_transform(X.toarray())

# Create a 3-D scatter plot, coloured by cluster label.
fig = plt.figure(figsize=(10,10))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=labels, cmap='viridis')
ax.set_title('Clusters de Motivación')
ax.set_xlabel('Componente Principal 1')
ax.set_ylabel('Componente Principal 2')
ax.set_zlabel('Componente Principal 3')

# Add a legend with one entry per cluster label.
legend1 = ax.legend(*scatter.legend_elements(), title='Clusters')
ax.add_artist(legend1)

plt.show()


# Sentiment analysis

In [None]:
from textblob import TextBlob


# Classify each motivation text by the sign of its TextBlob polarity and
# print how many are positive, negative and neutral.
polarities = [TextBlob(text).sentiment.polarity for text in motivation_column]

positive_count = sum(1 for polarity in polarities if polarity > 0)
negative_count = sum(1 for polarity in polarities if polarity < 0)
# Every text that is neither positive nor negative counts as neutral.
neutral_count = len(polarities) - positive_count - negative_count

total_count = len(motivation_column)
print(f"Total Texts: {total_count}")
print(f"Positive Texts: {positive_count}")
print(f"Negative Texts: {negative_count}")
print(f"Neutral Texts: {neutral_count}")
