In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the raw dataset into the module-level DataFrame `df` that the report
# and plotting functions below are called with. The path is relative, so it
# is resolved against the current working directory.
df = pd.read_csv('../data/original_raw_data.csv')

# Data Description
def data_description(df):
    """Build a plain-text overview of a DataFrame.

    The overview contains, in this order: the shape, the column index, the
    per-column dtypes, the first rows (``df.head()``) and the summary
    statistics (``df.describe()``). Each section is a heading line followed
    by the ``str()`` rendering of the corresponding pandas object.

    Args:
        df: The DataFrame to describe.

    Returns:
        A single string with all sections joined by newlines.
    """
    # (heading, value) pairs; headings after the first start with a newline so
    # that sections are separated by a blank line in the joined output.
    sections = (
        ("Data Shape:", df.shape),
        ("\nColumns:", df.columns),
        ("\nData Types:", df.dtypes),
        ("\nData Head:", df.head()),
        ("\nData Description:", df.describe()),
    )

    lines = []
    for heading, value in sections:
        lines.append(heading)
        lines.append(str(value))
    return "\n".join(lines)

# Echo the description report to the cell output.
print(data_description(df))

# Early Data Exploration Report
def early_data_exploration(df):
    """Build a per-column exploration report for a DataFrame.

    For every column the report lists the column label, the number of
    distinct values (``Series.nunique()``) and the ten most frequent values
    (``Series.value_counts().head(10)``).

    Args:
        df: The DataFrame to explore.

    Returns:
        A newline-joined string with one section per column; an empty string
        when ``df`` has no columns.
    """
    exploration = []
    for col in df.columns:
        series = df[col]
        exploration.extend([
            # Format the label rather than concatenating it: column labels
            # are not guaranteed to be strings (e.g. integer labels).
            f"\nColumn: {col}",
            f"Unique values count: {series.nunique()}",
            "Value counts: ",
            str(series.value_counts().head(10)),
        ])
    return "\n".join(exploration)

# Echo the per-column exploration report to the cell output.
print(early_data_exploration(df))

# Data Quality Report
def data_quality_report(df):
    """Summarise missing data per column as text.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A string starting with a "Data Quality Report:" heading, followed by
        the ``str()`` rendering of a table indexed by column name with two
        columns: the count of null entries and that count as a percentage of
        the number of rows.
    """
    row_count = df.shape[0]
    null_counts = df.isnull().sum()
    null_share = (null_counts / row_count) * 100

    summary = pd.DataFrame(
        {
            "Missing Values": null_counts,
            "Percentage": null_share,
        }
    )
    return "\nData Quality Report:\n" + str(summary)

# Echo the missing-value report to the cell output.
print(data_quality_report(df))

# Data Visualization
def data_visualization(df):
    """Render exploratory plots for ``df`` and save them as PNG files.

    All files are written to ``../docs/plots`` (created if missing):

    * one histogram and one box plot per numeric column,
    * one bar plot of the ten most frequent values per object/category column,
    * one scatter plot per unordered pair of numeric columns,
    * a correlation heatmap and a seaborn pair plot over the numeric columns,
    * one stacked bar plot per unordered pair of object/category columns.

    Every figure is drawn on an explicitly created axes and closed after it
    is saved, so no figure is left open when the function returns.

    Args:
        df: The DataFrame to visualise.

    Returns:
        None. The function is called for its side effect of writing files.
    """
    import os
    from itertools import combinations

    # Create a directory for saving plots
    plots_dir = '../docs/plots'
    os.makedirs(plots_dir, exist_ok=True)

    def save_and_close(fig, filename):
        # Save one figure into the plots directory and release it.
        fig.savefig(f'{plots_dir}/{filename}', bbox_inches='tight')
        plt.close(fig)

    numeric_columns = df.select_dtypes(include=[np.number]).columns
    cat_columns = df.select_dtypes(include=['object', 'category']).columns

    # Histograms
    for col in numeric_columns:
        fig, ax = plt.subplots()
        df[col].plot.hist(ax=ax, title=f'Histogram of {col}')
        ax.set_xlabel(col)
        save_and_close(fig, f'histogram_{col}.png')

    # Bar plots
    for col in cat_columns:
        fig, ax = plt.subplots()
        df[col].value_counts().head(10).plot.bar(ax=ax, title=f'Bar Plot of {col}')
        ax.set_xlabel(col)
        save_and_close(fig, f'bar_plot_{col}.png')

    # Box plots
    for col in numeric_columns:
        fig, ax = plt.subplots()
        df[col].plot.box(ax=ax, title=f'Box Plot of {col}')
        ax.set_ylabel(col)
        save_and_close(fig, f'box_plot_{col}.png')

    # Scatter plots
    # DataFrame.plot opens its own figure unless an axes is supplied, so the
    # axes is passed explicitly; a bare plt.figure() beforehand would be left
    # behind as an empty, never-closed figure.
    for x_col, y_col in combinations(numeric_columns, 2):
        fig, ax = plt.subplots()
        df.plot.scatter(x=x_col, y=y_col, ax=ax, title=f'Scatter Plot of {x_col} vs {y_col}')
        save_and_close(fig, f'scatter_plot_{x_col}_vs_{y_col}.png')

    # The heatmap and pair plot need at least one numeric column.
    if len(numeric_columns) > 0:
        numeric_df = df[numeric_columns]

        # Correlation heatmap
        # Correlate numeric columns only: DataFrame.corr() raises on
        # non-numeric columns in pandas >= 2.0.
        fig, ax = plt.subplots()
        corr_matrix = numeric_df.corr()
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
        ax.set_title("Correlation Heatmap")
        save_and_close(fig, 'correlation_heatmap.png')

        # Pair plots
        g = sns.pairplot(numeric_df)
        g.fig.suptitle("Pair Plots", y=1.02)
        g.savefig(f'{plots_dir}/pair_plots.png', bbox_inches='tight')
        plt.close(g.fig)

    # Stacked bar plots
    for row_col, stack_col in combinations(cat_columns, 2):
        fig, ax = plt.subplots()
        crosstab = pd.crosstab(df[row_col], df[stack_col])
        crosstab.plot.bar(stacked=True, ax=ax, title=f'Stacked Bar Plot of {row_col} vs {stack_col}')
        ax.set_xlabel(row_col)
        save_and_close(fig, f'stacked_bar_plot_{row_col}_vs_{stack_col}.png')

# Render every plot for the loaded dataset and save it under ../docs/plots
# (the directory is created by the function if it does not exist).
data_visualization(df)

# Save the reports to files in the specified directory
def save_reports(data=None, target_directory="../docs/"):
    """Write the three text reports to files in ``target_directory``.

    The files written are ``data_description.txt``,
    ``early_data_exploration.txt`` and ``data_quality_report.txt``, produced
    by the functions of the same names.

    Args:
        data: DataFrame to report on. Defaults to the module-level ``df``.
        target_directory: Directory to write into; created if missing.

    Returns:
        None.
    """
    import os

    if data is None:
        data = df  # fall back to the DataFrame loaded at module level

    os.makedirs(target_directory, exist_ok=True)

    report_builders = {
        "data_description.txt": data_description,
        "early_data_exploration.txt": early_data_exploration,
        "data_quality_report.txt": data_quality_report,
    }
    for filename, build_report in report_builders.items():
        path = os.path.join(target_directory, filename)
        # Explicit UTF-8: the reports embed raw text values from the data, and
        # the platform default encoding may not be able to represent them.
        with open(path, "w", encoding="utf-8") as file:
            file.write(build_report(data))

# Write the three text reports for the module-level `df` into ../docs/.
save_reports()



Data Shape:
(24783, 7)

Columns:
Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')

Data Types:
Unnamed: 0             int64
count                  int64
hate_speech            int64
offensive_language     int64
neither                int64
class                  int64
tweet                 object
dtype: object

Data Head:
   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba.

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>