# In [None]:
# Import necessary libraries
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Read the raw CSV. latin1 maps every byte to a character, so decoding
# cannot fail on non-UTF-8 input.
mydata = 'mydata.csv'
df = pd.read_csv(mydata, encoding="latin1")

# Preview the first rows, then summarise column dtypes / non-null counts
print("First 5 rows of the dataset:", df.head(), sep="\n")
print("\nDataset Information:")
df.info()

# Count missing entries per column before any cleaning
print("\nMissing Values in Each Column:", df.isna().sum(), sep="\n")

# Keep only the rows that are complete in every column
df_cleaned = df.dropna(axis=0, how="any")

# Confirm the cleaned frame has no missing entries left
print("\nMissing Values After Cleaning:", df_cleaned.isna().sum(), sep="\n")

# Report the row counts on either side of the cleaning step
print(f"\nNumber of rows before cleaning: {len(df)}")
print(f"Number of rows after cleaning: {len(df_cleaned)}")



# Load the Iris CSV. The groupby below requires a 'variety' column.
iris = 'iris.csv'
data = pd.read_csv(iris, encoding="latin1")

# Summary statistics (count, mean, std, quartiles) for the numeric columns
statistics = data.describe()
# sep="" stops print() from inserting a space after the newline, which
# would otherwise shift the table's header row out of alignment.
print("Basic Statistics:\n", statistics, sep="")

# Per-variety mean of every numeric column. numeric_only=True keeps the
# aggregation from raising TypeError if the file carries other text columns.
grouped_data = data.groupby('variety').mean(numeric_only=True)
print("Grouped Mean:\n", grouped_data, sep="")

# Example dataset: scikit-learn's bundled Iris data (measurements + species)
from sklearn.datasets import load_iris

# Build the feature table first, then attach the species label column
iris = load_iris()
iris_df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
# Targets are class codes 0/1/2; translate them to readable species names
iris_df['species'] = pd.Series(iris['target']).map(
    {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
)

# Example dataset for a time-series plot
np.random.seed(42)
time_series_df = pd.DataFrame({
    'date': pd.date_range(start='2024-01-01', periods=12, freq='ME'),
    'sales': np.random.randint(200, 500, size=12)
})

# 1. Line Chart: Time-series Sales Data
# Drawn on an explicit Axes so every styling call names its target.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    time_series_df['date'],
    time_series_df['sales'],
    marker='o',
    color='blue',
    label='Monthly Sales',
)
ax.set_title('Monthly Sales Over Time', fontsize=16)
ax.set_xlabel('Date', fontsize=14)
ax.set_ylabel('Sales', fontsize=14)
ax.grid(True)
ax.legend()
plt.show()



# 2. Bar Chart: Average Petal Length Per Species
# Mean petal length for each species label (one bar per species)
avg_petal_length = iris_df.groupby('species')['petal length (cm)'].mean()

fig, ax = plt.subplots(figsize=(8, 6))
avg_petal_length.plot.bar(
    ax=ax,
    color=['red', 'green', 'blue'],
    edgecolor='black',
)
ax.set_title('Average Petal Length by Species', fontsize=16)
ax.set_xlabel('Species', fontsize=14)
ax.set_ylabel('Average Petal Length (cm)', fontsize=14)
# Horizontal guide lines only, so the bars stay visually dominant
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# 3. Histogram: Sepal Length Distribution
# 15-bin histogram with a kernel density estimate overlaid (kde=True)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    iris_df['sepal length (cm)'],
    kde=True,
    bins=15,
    color='purple',
    ax=ax,
)
ax.set_title('Distribution of Sepal Length', fontsize=16)
ax.set_xlabel('Sepal Length (cm)', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
ax.grid(True)
plt.show()

# 4. Scatter Plot: Sepal Length vs. Petal Length
# One point per flower, coloured by species
fig, ax = plt.subplots(figsize=(10, 6))
scatter_style = dict(hue='species', palette='viridis', s=100, edgecolor='black')
sns.scatterplot(
    data=iris_df,
    x='sepal length (cm)',
    y='petal length (cm)',
    ax=ax,
    **scatter_style,
)
ax.set_title('Sepal Length vs. Petal Length by Species', fontsize=16)
ax.set_xlabel('Sepal Length (cm)', fontsize=14)
ax.set_ylabel('Petal Length (cm)', fontsize=14)
# Rebuild the legend so it carries a capitalised title
ax.legend(title='Species')
ax.grid(True)
plt.show()


