# TP MM03
### Data Science Exercises
This notebook contains solutions for exercises described in the TP document, including data manipulation, visualization, and cleaning using Pandas and other libraries.

## Exercise 1: Loan Requests and Credits
### Objective
Analyze loan data to calculate financial metrics and visualize insights.

In [1]:
import pandas as pd

# Load the data
data = pd.read_csv('data_mm03_loans.csv')
data.head()

Unnamed: 0,identifiant,ville,CP,revenu,remboursement,duree,type,taux
0,0,TOULOUSE,31100,3669.0,1130.05,240,immobilier,1.168
1,1,PARIS,75009,5310.0,240.0,64,automobile,3.701
2,1,PARIS,75009,5310.0,1247.85,300,immobilier,1.173
3,2,MARSEILLE,13010,1873.0,552.54,240,immobilier,0.972
4,3,MARSEILLE,13010,1684.0,586.03,180,immobilier,1.014


### Q2: Data Enrichment

In [2]:
# Q2.1: Calculate 'taux_endettement'
data['taux_endettement'] = round((data['mensualites'] / data['revenu']) * 100, 2)
# Q2.2: Rename 'taux' to 'taux_interet'
data.rename(columns={'taux': 'taux_interet'}, inplace=True)
# Q2.3: Calculate 'cout_total'
data['cout_total'] = data['mensualites'] * data['duree']
# Q2.4: Calculate 'benefices'
data['benefices'] = (data['cout_total'] * data['taux_interet']) / (100 * 24)
data.head()

KeyError: 'mensualites'

### Q3: Preliminary Visualizations

In [None]:
import matplotlib.pyplot as plt

# Q3.1: Scatter plot for Paris and Toulouse
paris_toulouse = data[data['ville'].isin(['Paris', 'Toulouse'])]
for city in paris_toulouse['ville'].unique():
    subset = paris_toulouse[paris_toulouse['ville'] == city]
    plt.scatter(subset['revenu'], subset['taux_interet'], label=city)

plt.legend()
plt.title('Revenu vs Taux d’Intérêt (Paris & Toulouse)')
plt.xlabel('Revenu')
plt.ylabel('Taux d’Intérêt')
plt.show()

### Q4: Filtering and Manipulations

In [None]:
# Q4.1: Loans of type 'automobile'
auto_loans = data[data['type'] == 'automobile']
# Q4.2: IDs of clients with 'automobile' loans
auto_loan_ids = auto_loans['client_id']
# Q4.3: IDs and income of clients with 'automobile' loans
auto_loan_info = auto_loans[['client_id', 'revenu']]
auto_loan_info.head()

### Q5: Aggregations

In [None]:
# Q5.1: Count and average total cost of 'automobile' loans
auto_loan_count = auto_loans.shape[0]
auto_loan_avg_cost = auto_loans['cout_total'].mean()
auto_loan_count, auto_loan_avg_cost

## Exercise 2: Olympic Games Analysis
### Objective
Analyze historical Olympic data to derive insights and visualizations.

### Q1: Data Import and Null Values

In [None]:
olympic_data = pd.read_csv('data_mm03_Olympics.csv')

# Count null values per column and total
null_counts = olympic_data.isnull().sum()
total_nulls = null_counts.sum()
null_counts, total_nulls

### Q2: Data Exploration and Filtering

In [None]:
# Q2.1: Count unique sports and the least represented ones
unique_sports = olympic_data['Sport'].value_counts()
least_represented = unique_sports.tail(20)
least_represented

In [None]:
# Q2.2: Countries with <10 appearances and the most represented
country_counts = olympic_data['Team'].value_counts()
rare_countries = country_counts[country_counts < 10]
most_represented = country_counts.head(3)
rare_countries, most_represented

### Q3: Advanced Queries

In [None]:
# Q3.1: Oldest athlete and oldest medalist
oldest_athlete = olympic_data.loc[olympic_data['Age'].idxmax()]
medalist_data = olympic_data[olympic_data['Medal'].notnull()]
oldest_medalist = medalist_data.loc[medalist_data['Age'].idxmax()]
oldest_athlete, oldest_medalist

### Q4: Evolution of Medals Over Time

In [None]:
medal_counts = olympic_data.groupby(['Year', 'Season'])['Medal'].count().unstack()
medal_counts.plot(kind='line', title='Medals Over Time (Summer vs Winter)', xlabel='Year', ylabel='Count')
plt.show()

### Q5: Distribution Analysis

In [None]:
# Q5.1: Boxplots for age, height, and weight
olympic_data[['Age', 'Height', 'Weight']].plot.box(title='Distributions of Age, Height, and Weight')
plt.show()

In [None]:
boxplot_data = olympic_data[['Age', 'Height', 'Weight']]
boxplot_data.plot.box(title='Distributions of Age, Height, and Weight')
plt.show()

# Q5.2-Q5.3: Characterize distributions
distribution_stats = boxplot_data.describe().T
distribution_stats['variance'] = boxplot_data.var()
distribution_stats

In [None]:
boxplot_data.plot(kind='density', subplots=True, layout=(1, 3), sharex=False, title='Empirical Distributions')
plt.show()

In [None]:
correlation_matrix = boxplot_data.corr()
print(correlation_matrix)