# EDA

This notebook works through the following exploratory data analysis steps:

- Check for missing values.
- Determine the type of each column (categorical or numerical?).
- Descriptive statistics for each column (min, max, mean, etc.).
- Scatter plots between each pair of columns and histograms for each column.
- Record count for each category.
- Check for outliers.
- Correlation between columns.

In [1]:
import pandas as pd
merged_data = pd.read_csv("merged_data.csv")

# Count the missing (NaN) entries in every column
missing_values = merged_data.isna().sum()

missing_values


Date                   0
Open                   0
High                   0
Low                    0
Close                  0
Adj Close              0
Volume                 0
Ticker                 0
SentimentScore    176469
dtype: int64

`SentimentScore` is the only column with missing values: 176,469 of the 233,135 rows (about 76%) have no sentiment score.

In [2]:
# Check the data type pandas inferred for each column.
# In the output below, Date and Ticker come back as `object` (not numeric);
# Date has not been parsed into a datetime dtype.
data_types = merged_data.dtypes

data_types

Date               object
Open              float64
High              float64
Low               float64
Close             float64
Adj Close         float64
Volume              int64
Ticker             object
SentimentScore    float64
dtype: object

In [3]:
# Get descriptive statistics (count, mean, std, min, quartiles, max).
# Called without arguments, describe() summarises only the numeric columns,
# which is why Date and Ticker do not appear in the table below.
descriptive_stats = merged_data.describe()

descriptive_stats

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,SentimentScore
count,233135.0,233135.0,233135.0,233135.0,233135.0,233135.0,56666.0
mean,134.09546,135.864769,132.277744,134.11289,130.112657,11438060.0,0.013755
std,217.520329,220.320146,214.613616,217.48761,217.902406,28269150.0,0.066255
min,0.7,0.71,0.65,0.7,0.7,0.0,-0.92
25%,38.450001,38.900002,37.98,38.459999,34.619999,1503500.0,0.0
50%,72.029999,72.900002,71.15667,72.059998,66.220001,3245000.0,0.004
75%,147.979996,149.854752,145.979996,147.990005,142.8405,8550050.0,0.04
max,2697.75,2721.850098,2687.810059,2703.26001,2703.26001,1065523000.0,1.0


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the style of seaborn
sns.set(style="whitegrid")

# Create a pair plot: scatter plots for every pair of numeric columns,
# coloured by ticker. dropna() keeps only rows that have every value present
# (per the missing-value check, that means rows with a SentimentScore).
# With `hue` set, seaborn's default diagonal is a KDE curve, so histograms are
# requested explicitly to get the per-column histograms this step calls for.
pair_plot = sns.pairplot(merged_data.dropna(), hue='Ticker', diag_kind='hist')

plt.show()




In [None]:
# Number of records per ticker symbol (value_counts sorts most frequent first)
ticker_column = merged_data["Ticker"]
category_counts = ticker_column.value_counts()

category_counts

In [None]:
import numpy as np
# Define a function to detect outliers in each column using the IQR method
def detect_outliers(df, features):
    """Return index labels of rows flagged as outliers by the 1.5 * IQR rule.

    For each column in ``features`` the 25th and 75th percentiles are computed
    while ignoring missing values; a row is flagged when its value lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. Missing values are never
    flagged themselves.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to inspect.
    features : iterable of str
        Names of the numeric columns to check.

    Returns
    -------
    list
        Index labels of the flagged rows, grouped by feature in the order
        given. A row flagged in several features appears once per feature.
    """
    outlier_indices = []

    for c in features:
        # nanpercentile skips NaN; plain np.percentile returns NaN as soon as
        # the column has one missing value, which makes both comparisons
        # below False for every row and silently hides all outliers.
        # (A column that is entirely NaN still yields NaN quartiles and
        # therefore no outliers.)
        Q1, Q3 = np.nanpercentile(df[c], [25, 75])
        # Outlier step: 1.5 times the interquartile range
        outlier_step = 1.5 * (Q3 - Q1)
        lower_fence = Q1 - outlier_step
        upper_fence = Q3 + outlier_step
        # Rows outside the fences for this feature
        is_outlier = (df[c] < lower_fence) | (df[c] > upper_fence)
        outlier_indices.extend(df.index[is_outlier.to_numpy()])

    return outlier_indices

# Detect outliers in the numerical columns
numerical_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'SentimentScore']
outlier_indices = detect_outliers(merged_data, numerical_columns)

# detect_outliers reports a row once per column in which it is extreme, so the
# same label can occur several times. Drop the repeats (keeping first-seen
# order) so that each flagged record is selected only once.
unique_outlier_indices = list(dict.fromkeys(outlier_indices))

# Create a dataframe of outliers, one row per flagged record
outliers = merged_data.loc[unique_outlier_indices]

outliers.head()


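The five rows shown above are only the first rows of `outliers` (the output of `head()`), not the complete set of flagged records. Open question: how should these outlying values be handled?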

In [None]:
# Keep the numeric dtypes only; the object columns are left out
numeric_columns = merged_data.select_dtypes(include=np.number)

# Pairwise correlation between the numeric columns
correlation_matrix = numeric_columns.corr()
correlation_matrix

In [None]:
# Draw the correlation matrix as an annotated heatmap on an explicit axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)

plt.show()
