# Data Analysis Project
## MySQL Database and Excel File Analysis

This notebook provides tools and examples for analysing data from:
- MySQL databases
- Excel files (.xlsx, .xls)


## 1. Install Required Libraries

Uncomment the `pip install` line in the cell below and run it if the required packages are not already installed.


In [None]:
# Install required packages (uncomment if needed)
# !pip install pandas numpy mysql-connector-python sqlalchemy openpyxl matplotlib seaborn


## 2. Import Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text
import mysql.connector
from mysql.connector import Error
import warnings

# Set display options
# None lifts the column cap so wide frames are shown in full.
pd.set_option('display.max_columns', None)
# Cap printed rows at 100 to keep cell output manageable.
pd.set_option('display.max_rows', 100)
# None lets pandas work out the display width itself instead of using a fixed value.
pd.set_option('display.width', None)
# NOTE: this silences every warning for the whole session, including
# deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

# Set plotting style (tries multiple styles for compatibility)
# The seaborn styles bundled with matplotlib were renamed with a "v0_8"
# infix in newer releases; try the new name first, then the old one,
# and finally fall back to 'ggplot'.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    # Style name not known to this matplotlib version; try the older name.
    try:
        plt.style.use('seaborn-darkgrid')
    except OSError:
        # Neither seaborn style is available; an error here is not handled.
        plt.style.use('ggplot')
# Default colour cycle for subsequent seaborn/matplotlib plots.
sns.set_palette("husl")

print("Libraries imported successfully!")


## 3. MySQL Database Connection


In [None]:
# MySQL Database Configuration
# Update these values with your database credentials
from urllib.parse import quote

MYSQL_CONFIG = {
    'host': 'localhost',
    'database': 'your_database_name',
    'user': 'your_username',
    'password': 'your_password',
    'port': 3306
}


def build_connection_string(config):
    """
    Build a SQLAlchemy connection URL for the mysql-connector driver

    The user name and password are percent-encoded so that characters with a
    special meaning inside a URL (such as '@', ':', '/' or '#') do not
    corrupt the connection string.

    Parameters:
    -----------
    config : dict
        Mapping with the keys 'host', 'database', 'user', 'password', 'port'

    Returns:
    --------
    str
        URL of the form mysql+mysqlconnector://user:password@host:port/database
    """
    # safe='' forces every reserved character, including '/', to be encoded.
    user = quote(str(config['user']), safe='')
    password = quote(str(config['password']), safe='')
    return (
        f"mysql+mysqlconnector://{user}:{password}"
        f"@{config['host']}:{config['port']}/{config['database']}"
    )


# Create connection string for SQLAlchemy
CONNECTION_STRING = build_connection_string(MYSQL_CONFIG)

print("Configuration set. Update the credentials above before connecting.")


### 3.1 Test MySQL Connection


In [None]:
def test_mysql_connection(config):
    """
    Test MySQL database connection

    Opens a connection with mysql.connector, prints the server version and
    the currently selected database, and always closes the cursor and the
    connection again, even when a statement fails part-way through.

    Parameters:
    -----------
    config : dict
        Keyword arguments passed to mysql.connector.connect
        (for example MYSQL_CONFIG)

    Returns:
    --------
    bool
        True if the connection and the probe query succeeded, False otherwise
    """
    try:
        connection = mysql.connector.connect(**config)
        try:
            if not connection.is_connected():
                # Return an explicit False instead of falling through to None.
                print("✗ Error connecting to MySQL: connection is not active")
                return False

            db_info = connection.get_server_info()
            print(f"✓ Successfully connected to MySQL Server version {db_info}")

            cursor = connection.cursor()
            try:
                cursor.execute("SELECT DATABASE();")
                record = cursor.fetchone()
            finally:
                cursor.close()
            print(f"✓ Connected to database: {record[0]}")
            return True
        finally:
            # Runs on success and on failure so the connection is never leaked;
            # an Error raised here is still reported by the handler below.
            connection.close()
    except Error as e:
        print(f"✗ Error connecting to MySQL: {e}")
        return False

# Uncomment to test connection
# test_mysql_connection(MYSQL_CONFIG)


### 3.2 Query MySQL Database


In [None]:
def query_mysql(query, connection_string=None):
    """
    Execute a SQL query and return results as a pandas DataFrame

    A fresh SQLAlchemy engine is created for each call and disposed again
    afterwards, whether or not the query succeeds.

    Parameters:
    -----------
    query : str
        SQL query to execute
    connection_string : str, optional
        Database connection string. If None, uses CONNECTION_STRING

    Returns:
    --------
    pd.DataFrame or None
        Query results as a DataFrame, or None if the query failed
        (the error is printed rather than raised)
    """
    if connection_string is None:
        connection_string = CONNECTION_STRING

    try:
        engine = create_engine(connection_string)
        try:
            return pd.read_sql(query, engine)
        finally:
            # Dispose on the failure path too, so a bad query does not
            # leave the engine's connections open.
            engine.dispose()
    except Exception as e:
        print(f"Error executing query: {e}")
        return None

# Example: Query all tables in the database
# query = "SHOW TABLES;"
# tables = query_mysql(query)
# print(tables)


### 3.3 Example: Load Data from MySQL Table


In [None]:
# Example: Load data from a specific table
# Replace 'your_table_name' with your actual table name

# table_name = 'your_table_name'
# query = f"SELECT * FROM {table_name} LIMIT 1000;"
# df_mysql = query_mysql(query)
# print(f"Loaded {len(df_mysql)} rows from MySQL")
# df_mysql.head()


## 4. Excel File Operations


### 4.1 Read Excel File


In [None]:
def read_excel_file(file_path, sheet_name=None, header=0, skiprows=None, nrows=None):
    """
    Load one or more sheets of an Excel workbook with pandas

    Parameters:
    -----------
    file_path : str
        Location of the workbook
    sheet_name : str, int, or list, optional
        Sheet(s) to load. When omitted, the first sheet is read
    header : int, optional
        0-indexed row that holds the column names
    skiprows : int or list, optional
        Rows to skip at the top of the sheet
    nrows : int, optional
        Maximum number of rows to read

    Returns:
    --------
    pd.DataFrame, dict, or None
        A DataFrame for a single sheet, a dict of DataFrames when a list of
        sheets is requested, or None if reading failed (the error is printed)
    """
    # Index 0 is pandas' own default, i.e. what "no sheet given" resolves to.
    target_sheet = 0 if sheet_name is None else sheet_name

    try:
        return pd.read_excel(
            file_path,
            sheet_name=target_sheet,
            header=header,
            skiprows=skiprows,
            nrows=nrows,
        )
    except Exception as exc:
        print(f"Error reading Excel file: {exc}")
        return None

# Example: Read Excel file
# excel_file_path = 'path/to/your/file.xlsx'
# df_excel = read_excel_file(excel_file_path)
# print(f"Loaded {len(df_excel)} rows from Excel")
# df_excel.head()


### 4.2 Read Multiple Sheets from Excel


In [None]:
def read_all_excel_sheets(file_path):
    """
    Read all sheets from an Excel file

    The workbook is opened once and closed again when reading finishes,
    including when a sheet fails to load.

    Parameters:
    -----------
    file_path : str
        Path to the Excel file

    Returns:
    --------
    dict or None
        Dictionary with sheet names as keys and DataFrames as values,
        or None if the file could not be read (the error is printed)
    """
    try:
        # The context manager closes the underlying file handle on exit.
        with pd.ExcelFile(file_path) as excel_file:
            sheet_names = excel_file.sheet_names
            print(f"Found {len(sheet_names)} sheet(s): {sheet_names}")

            dfs = {}
            for sheet in sheet_names:
                frame = pd.read_excel(excel_file, sheet_name=sheet)
                dfs[sheet] = frame
                print(f"  - {sheet}: {len(frame)} rows, {len(frame.columns)} columns")

        return dfs
    except Exception as e:
        print(f"Error reading Excel file: {e}")
        return None

# Example: Read all sheets
# excel_file_path = 'path/to/your/file.xlsx'
# all_sheets = read_all_excel_sheets(excel_file_path)
# # Access specific sheet: all_sheets['Sheet1']


## 5. Data Exploration and Analysis


### 5.1 Basic Data Information


In [None]:
def explore_data(df, name="Dataset"):
    """
    Display comprehensive information about a DataFrame

    Prints the shape, column dtypes, first and last rows, summary
    statistics, a table of missing values and the memory footprint.
    Nothing is returned.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame to explore
    name : str
        Name of the dataset for display purposes
    """
    separator = "=" * 60
    print(separator)
    print(f"Data Exploration: {name}")
    print(separator)

    print(f"\nShape: {df.shape[0]} rows × {df.shape[1]} columns")

    print("\nColumn Names and Types:")
    print(df.dtypes)

    print("\nFirst 5 rows:")
    print(df.head())

    print("\nLast 5 rows:")
    print(df.tail())

    print("\nBasic Statistics:")
    if df.shape[1] == 0:
        # describe() raises ValueError on a frame without columns.
        print("No columns to describe.")
    else:
        print(df.describe())

    print("\nMissing Values:")
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing,
        'Missing Percentage': missing_pct
    })
    # Only list columns that actually contain missing values, worst first.
    missing_df = missing_df[missing_df['Missing Count'] > 0].sort_values('Missing Count', ascending=False)
    if len(missing_df) > 0:
        print(missing_df)
    else:
        print("No missing values!")

    # deep=True also counts the contents of object columns (e.g. strings).
    print(f"\nMemory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Example usage:
# explore_data(df_mysql, "MySQL Data")
# explore_data(df_excel, "Excel Data")


### 5.2 Data Cleaning Functions


In [None]:
def clean_data(df, drop_duplicates=True, handle_missing='drop', fill_value=None):
    """
    Clean DataFrame by removing duplicates and handling missing values

    The input DataFrame is not modified; a cleaned copy is returned.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame to clean
    drop_duplicates : bool
        Whether to remove duplicate rows
    handle_missing : str
        'drop' to drop rows with missing values,
        'fill' to fill with fill_value,
        'keep' to keep as is.
        Any other value is reported and treated like 'keep'
    fill_value : any
        Value to fill missing values with (required if handle_missing='fill')

    Returns:
    --------
    pd.DataFrame
        Cleaned DataFrame

    Raises:
    -------
    ValueError
        If handle_missing='fill' is requested without a fill_value
    """
    # Fail before doing any work, with a message that names the real problem
    # (pandas would otherwise raise a less specific ValueError from fillna).
    if handle_missing == 'fill' and fill_value is None:
        raise ValueError("fill_value must be provided when handle_missing='fill'")

    df_cleaned = df.copy()

    if drop_duplicates:
        before = len(df_cleaned)
        df_cleaned = df_cleaned.drop_duplicates()
        print(f"Removed {before - len(df_cleaned)} duplicate rows")

    if handle_missing == 'drop':
        before = len(df_cleaned)
        df_cleaned = df_cleaned.dropna()
        print(f"Removed {before - len(df_cleaned)} rows with missing values")
    elif handle_missing == 'fill':
        df_cleaned = df_cleaned.fillna(fill_value)
        print(f"Filled missing values with {fill_value}")
    elif handle_missing != 'keep':
        # A typo such as 'dorp' used to be ignored silently; make it visible
        # while keeping the previous "leave data untouched" outcome.
        print(f"Unknown handle_missing option {handle_missing!r}; missing values left unchanged")

    return df_cleaned

# Example usage:
# df_cleaned = clean_data(df_excel, drop_duplicates=True, handle_missing='fill', fill_value=0)


### 5.3 Data Analysis Examples


In [None]:
# Example 1: Group by and aggregate
# df_grouped = df.groupby('column_name').agg({
#     'numeric_column': ['sum', 'mean', 'count']
# })
# print(df_grouped)

# Example 2: Filter data
# df_filtered = df[df['column_name'] > threshold]
# print(f"Filtered to {len(df_filtered)} rows")

# Example 3: Calculate statistics by category
# stats_by_category = df.groupby('category_column').agg({
#     'numeric_column': ['mean', 'std', 'min', 'max']
# })
# print(stats_by_category)

# Example 4: Pivot table
# pivot_table = pd.pivot_table(df, 
#                              values='value_column',
#                              index='row_column',
#                              columns='column_column',
#                              aggfunc='mean')
# print(pivot_table)


## 6. Data Visualisation


In [None]:
def plot_numeric_distribution(df, column, bins=30, figsize=(10, 6)):
    """
    Show a histogram and a box plot of one column side by side

    Missing values are dropped before plotting. If the column does not
    exist, a message is printed and nothing is drawn.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame holding the data
    column : str
        Name of the column to plot
    bins : int
        Number of histogram bins
    figsize : tuple
        Figure size passed to matplotlib
    """
    if column not in df.columns:
        print(f"Column '{column}' not found in DataFrame")
        return

    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=figsize)
    values = df[column].dropna()

    # Left panel: histogram
    hist_ax.hist(values, bins=bins, edgecolor='black', alpha=0.7)
    hist_ax.set_title(f'Distribution of {column}')
    hist_ax.set_xlabel(column)
    hist_ax.set_ylabel('Frequency')

    # Right panel: box plot
    box_ax.boxplot(values)
    box_ax.set_title(f'Box Plot of {column}')
    box_ax.set_ylabel(column)

    plt.tight_layout()
    plt.show()

# Example usage:
# plot_numeric_distribution(df, 'numeric_column')


In [None]:
def plot_categorical_counts(df, column, top_n=10, figsize=(10, 6)):
    """
    Draw a bar chart of the most frequent values in a column

    If the column does not exist, a message is printed and nothing is drawn.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame holding the data
    column : str
        Name of the column to count
    top_n : int
        How many of the most frequent values to show
    figsize : tuple
        Figure size passed to matplotlib
    """
    if column not in df.columns:
        print(f"Column '{column}' not found in DataFrame")
        return

    top_counts = df[column].value_counts().head(top_n)

    plt.figure(figsize=figsize)
    # Series.plot draws on the figure just created and hands back its axes.
    ax = top_counts.plot(kind='bar', color='steelblue', edgecolor='black')
    ax.set_title(f'Top {top_n} Values in {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Example usage:
# plot_categorical_counts(df, 'category_column', top_n=10)


In [None]:
def plot_correlation_heatmap(df, figsize=(12, 10)):
    """
    Draw an annotated heatmap of the correlations between numeric columns

    Non-numeric columns are ignored. With fewer than two numeric columns a
    message is printed and nothing is drawn.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame holding the data
    figsize : tuple
        Figure size passed to matplotlib
    """
    numeric_only = df.select_dtypes(include=[np.number])

    if numeric_only.shape[1] < 2:
        print("Need at least 2 numeric columns for correlation")
        return

    heatmap_options = {
        'annot': True,
        'fmt': '.2f',
        'cmap': 'coolwarm',
        'center': 0,
        'square': True,
        'linewidths': 1,
        'cbar_kws': {"shrink": 0.8},
    }

    plt.figure(figsize=figsize)
    sns.heatmap(numeric_only.corr(), **heatmap_options)
    plt.title('Correlation Heatmap')
    plt.tight_layout()
    plt.show()

# Example usage:
# plot_correlation_heatmap(df)


## 7. Combining MySQL and Excel Data


In [None]:
def merge_dataframes(df1, df2, on=None, how='inner', suffixes=('_x', '_y')):
    """
    Join two DataFrames and report the row counts involved

    Parameters:
    -----------
    df1, df2 : pd.DataFrame
        Left and right DataFrames of the join
    on : str or list
        Key column(s) shared by both DataFrames
    how : str
        Join type: 'left', 'right', 'outer', 'inner'
    suffixes : tuple
        Suffixes appended to column names that appear in both inputs

    Returns:
    --------
    pd.DataFrame
        Result of the join
    """
    result = pd.merge(left=df1, right=df2, how=how, on=on, suffixes=suffixes)

    # Summarise input sizes against the output size.
    row_counts = (len(df1), len(df2), len(result))
    print("Merged: {} rows × {} rows → {} rows".format(*row_counts))

    return result

# Example: Merge MySQL data with Excel data
# df_combined = merge_dataframes(df_mysql, df_excel, on='common_column', how='inner')
# df_combined.head()


## 8. Export Results


In [None]:
def export_to_excel(df, file_path, sheet_name='Sheet1', index=False):
    """
    Export DataFrame to Excel file

    When file_path is a path (str or os.PathLike), its parent directory is
    created first if it does not exist yet. Success or failure is printed;
    errors are not raised.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame to export
    file_path : str or path-like
        Destination of the Excel file
    sheet_name : str
        Name of the worksheet to write
    index : bool
        Whether to write the DataFrame index
    """
    import os

    try:
        # Only real paths have a directory to create; anything else is
        # handed to pandas untouched.
        if isinstance(file_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(file_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        df.to_excel(file_path, sheet_name=sheet_name, index=index)
        print(f"✓ Successfully exported to {file_path}")
    except Exception as e:
        print(f"✗ Error exporting to Excel: {e}")

def export_to_csv(df, file_path, index=False):
    """
    Export DataFrame to CSV file

    When file_path is a path (str or os.PathLike), its parent directory is
    created first if it does not exist yet. Success or failure is printed;
    errors are not raised.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame to export
    file_path : str or path-like
        Destination of the CSV file
    index : bool
        Whether to write the DataFrame index
    """
    import os

    try:
        # Only real paths have a directory to create; anything else is
        # handed to pandas untouched.
        if isinstance(file_path, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(file_path))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        df.to_csv(file_path, index=index)
        print(f"✓ Successfully exported to {file_path}")
    except Exception as e:
        print(f"✗ Error exporting to CSV: {e}")

# Example usage:
# export_to_excel(df_combined, 'output/analysis_results.xlsx', sheet_name='Results')
# export_to_csv(df_combined, 'output/analysis_results.csv')


## 9. Your Analysis

Add your custom analysis code below:


In [None]:
# Load your data here
# Example:
# df_mysql = query_mysql("SELECT * FROM your_table LIMIT 1000;")
# df_excel = read_excel_file('path/to/your/file.xlsx')

# Perform your analysis
# Example:
# explore_data(df_mysql, "MySQL Data")
# explore_data(df_excel, "Excel Data")

# Visualise your data
# Example:
# plot_numeric_distribution(df, 'column_name')
# plot_correlation_heatmap(df)
