In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Data Cleaning

## 1. Merging datasets of all years

In [None]:
# Read each fiscal year's CSV and tag its rows with a 'Year' column in one step
data_2014 = pd.read_csv('raw_data/unzipped_files/2014_Financial_Data.csv').assign(Year=2014)
data_2015 = pd.read_csv('raw_data/unzipped_files/2015_Financial_Data.csv').assign(Year=2015)
data_2016 = pd.read_csv('raw_data/unzipped_files/2016_Financial_Data.csv').assign(Year=2016)
data_2017 = pd.read_csv('raw_data/unzipped_files/2017_Financial_Data.csv').assign(Year=2017)
data_2018 = pd.read_csv('raw_data/unzipped_files/2018_Financial_Data.csv').assign(Year=2018)

# Stack the five yearly frames into a single DataFrame with a fresh 0..n-1 index
full_df = pd.concat(
    [data_2014, data_2015, data_2016, data_2017, data_2018],
    ignore_index=True,
)

# Show the combined dimensions together with a preview of the first rows
full_df.shape, full_df.head()

### Rename unnamed to Symbol

In [None]:
# The ticker symbols were read from the CSVs as an unnamed first column; give it a real name
full_df = full_df.rename(columns={'Unnamed: 0': 'Symbol'})
full_df.head()

In [None]:
# describe() summary of the merged dataset.
# Note: only the merge and the 'Symbol' rename have been applied to full_df so far;
# no cleaning or imputation has happened at this point of the notebook.
summary_after = full_df.describe()
summary_after

## Check data types

Everything looks good, every feature corresponds to its expected data type

In [None]:
# List every column next to the dtype pandas inferred for it
for column, dtype in full_df.dtypes.items():
    print(column, dtype)

## Consolidate price variations in one column

In [None]:
def get_correct_price_var(row):
    """Return the price variation that belongs to the row's own fiscal year.

    Each fiscal year N is paired with the column '<N+1> PRICE VAR [%]'.
    Rows whose 'Year' is missing or outside 2014-2018 yield NaN.
    """
    target_column_by_year = {
        2014: '2015 PRICE VAR [%]',
        2015: '2016 PRICE VAR [%]',
        2016: '2017 PRICE VAR [%]',
        2017: '2018 PRICE VAR [%]',
        2018: '2019 PRICE VAR [%]',
    }
    fiscal_year = row['Year']

    # Guard clause: unknown or missing year -> no target value
    if fiscal_year not in target_column_by_year or pd.isna(fiscal_year):
        return np.nan

    return row[target_column_by_year[fiscal_year]]

# Row-wise apply (axis=1): each row picks the PRICE VAR column that matches its own 'Year',
# and the result is stored in a single consolidated target column
full_df['Latest PRICE VAR [%]'] = full_df.apply(get_correct_price_var, axis=1)

In [None]:
# Columns to drop: the yearly price variation columns, now consolidated
# into 'Latest PRICE VAR [%]'
columns_to_drop = [
    '2015 PRICE VAR [%]', 
    '2016 PRICE VAR [%]', 
    '2017 PRICE VAR [%]', 
    '2018 PRICE VAR [%]', 
    '2019 PRICE VAR [%]'
]

# Drop these columns from the DataFrame
full_df = full_df.drop(columns=columns_to_drop)

# Verify by checking the shape of the updated DataFrame
full_df.shape

## Check features distributions before cleaning and imputations

Let's create a function to plot the distribution and the boxplot of each feature. Following the dataset documentation, we drop the values below the 1st percentile and above the 99th percentile (1% in each tail) before plotting.

In [None]:
def plot_financial_data(df, column, num_bins=50, outlier_cap=0.01):
    """Draw a histogram and a boxplot of ``column`` with both tails trimmed.

    Parameters
    ----------
    df : DataFrame holding the data.
    column : name of the column to plot.
    num_bins : number of histogram bins.
    outlier_cap : tail fraction cut on each side; only rows strictly between the
        ``outlier_cap`` and ``1 - outlier_cap`` quantiles are plotted.
    """
    # Quantile cut-offs for the lower and the upper tail
    tail_low, tail_high = df[column].quantile([outlier_cap, 1-outlier_cap])
    inside_tails = df[column].gt(tail_low) & df[column].lt(tail_high)
    trimmed = df[inside_tails]

    plt.figure(figsize=(12, 6))

    # Left panel: histogram of the trimmed values
    plt.subplot(1, 2, 1)
    sns.histplot(trimmed[column], bins=num_bins, kde=False, color='blue')
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')

    # Right panel: boxplot of the same trimmed values
    plt.subplot(1, 2, 2)
    sns.boxplot(x=trimmed[column])
    plt.title(f'Boxplot of {column}')
    plt.xlabel(column)

    plt.tight_layout()
    plt.show()
    # Close the figure once shown so repeated calls do not pile up open figures
    plt.close()

def plot_by_year(df, year):
    """Plot histogram and boxplot for every numeric column of one year's rows.

    Rows are selected by ``df['Year'] == year``; each numeric column of that
    subset (including 'Year' itself) is handed to ``plot_financial_data``.
    """
    year_rows = df.loc[df['Year'] == year]

    for numeric_column in year_rows.select_dtypes(include=[np.number]).columns:
        plot_financial_data(year_rows, numeric_column)

### Plotting 2014

In [None]:
# Plot data for each numeric column in the 2014 dataset
plot_by_year(full_df, 2014)

### Plotting 2015

In [None]:
plot_by_year(full_df, 2015)

### Plotting 2016

In [None]:
plot_by_year(full_df, 2016)

### Plotting 2017

In [None]:
plot_by_year(full_df, 2017)

### Plotting 2018

In [None]:
plot_by_year(full_df, 2018)

### Now let's create a heatmap to see how features are correlated in each year (Profit)

In [None]:
 profit_features = [
        'Revenue', 'Gross Profit', 'Operating Income', 'Net Income',
        'Total assets', 'Total liabilities', 'EBITDA', 'EPS'
    ]

def plot_simplified_correlation_heatmap(df, features_of_interest, year):
    """Show an annotated correlation heatmap of the given features for one year.

    Parameters
    ----------
    df : DataFrame with a 'Year' column.
    features_of_interest : list of column names to correlate.
    year : value of 'Year' whose rows are used.
    """
    in_year = df['Year'] == year
    pairwise_corr = df.loc[in_year, features_of_interest].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(pairwise_corr, annot=True, cmap='viridis', fmt=".2f")
    plt.title(f'Correlation Heatmap for the Year {year}')
    plt.show()

### Heat Map 2014 (Profits)

In [None]:
# Year 2014
plot_simplified_correlation_heatmap(full_df,profit_features, 2014)

### Heat Map 2015 (Profits)

In [None]:
# Year 2015
plot_simplified_correlation_heatmap(full_df, profit_features, 2015)

### Heat Map 2016 (Profits)

In [None]:
# Year 2016
plot_simplified_correlation_heatmap(full_df, profit_features, 2016)

### Heat Map 2017 (Profits)

In [None]:
# Year 2017
plot_simplified_correlation_heatmap(full_df, profit_features, 2017)

### Heat Map 2018 (Profits)

In [None]:
# Year 2018
plot_simplified_correlation_heatmap(full_df, profit_features, 2018)

### Let's make histograms and boxplots for about 30 metrics. Since we have more than 200 features, we will plot only the ones that are most likely to be proxies for our study goal.

#### Let's create a dataframe and plot the features that could potentially be proxies for generating alpha over the S&P 500 (given the number of financial indicators and ratios in our dataset).

##### Profitability Ratios<br><br>
Gross Profit Margin: Measures profitability after deducting the cost of goods sold.<br>
Operating Profit Margin: Measures profitability after deducting operating expenses.<br>
Net Profit Margin: Measures overall profitability after all expenses, including taxes and interest.<br>
Return on Equity (ROE): Measures the return generated on shareholders' equity.<br>
Return on Assets (ROA): Measures the efficiency of asset utilization in generating profits.<br>
Return on Investment (ROI): Measures the profitability of an investment relative to its cost.<br>
##### Liquidity Ratios<br><br>
Current Ratio: Measures a company’s ability to meet short-term obligations.<br>
Quick Ratio (Acid Test): Similar to the current ratio but excludes inventory.<br>
Cash Ratio: Measures a company's ability to meet short-term obligations with cash and cash equivalents.<br>
##### Solvency Ratios<br><br>
Debt-to-Equity Ratio: Measures the proportion of debt to equity financing.<br>
Interest Coverage Ratio: Measures a company's ability to meet interest payments on its debt.<br>
Debt-to-Asset Ratio: Measures the proportion of assets financed by debt.<br>
##### Efficiency Ratios<br><br>
Inventory Turnover Ratio: Measures how efficiently a company manages its inventory.<br>
Days Sales Outstanding (DSO): Measures the average number of days it takes to collect receivables.<br>
Accounts Payable Turnover: Measures how efficiently a company manages its payables.<br>
Asset Turnover Ratio: Measures how efficiently a company generates sales from its assets.<br>
##### Valuation Ratios<br><br>
Price-to-Earnings (P/E) Ratio: Measures the price investors are willing to pay for each dollar of earnings.<br>
Price-to-Book (P/B) Ratio: Compares a company's market value to its book value.<br>
Price-to-Sales (P/S) Ratio: Measures the price investors are willing to pay for each dollar of sales.<br>
Dividend Yield: Measures the annual dividend per share relative to the share price.<br>
Dividend Payout Ratio: Measures the proportion of earnings paid out as dividends.<br>
Price-to-Cash Flow (P/CF) Ratio: Compares a company's market value to its operating cash flow.<br>
Free Cash Flow Yield: Measures the free cash flow generated per share relative to the share price.<br>
##### Growth Ratios<br><br>
Earnings Per Share (EPS) Growth Rate: Measures the growth rate of earnings per share.<br>
Revenue Growth Rate: Measures the growth rate of a company's sales.<br>
Book Value Per Share Growth Rate: Measures the growth rate of a company's book value per share.<br>
##### Additional Ratios<br><br>
PEG Ratio: Compares the P/E ratio to the expected earnings growth rate.<br>
Enterprise Value (EV) to EBITDA: Compares a company's enterprise value to its earnings before interest, taxes, depreciation, and amortization.<br>

In [None]:
def plot_full_correlation_heatmap(df, features_of_interest, year, title):
    """Show an un-annotated correlation heatmap of the given features for one year.

    Parameters
    ----------
    df : DataFrame with a 'Year' column.
    features_of_interest : list of column names to correlate.
    year : value of 'Year' whose rows are used.
    title : text placed above the heatmap.
    """
    year_slice = df.loc[df['Year'] == year, features_of_interest]
    pairwise_corr = year_slice.corr()

    plt.figure(figsize=(14, 10))
    sns.heatmap(pairwise_corr, annot=False, cmap='coolwarm', fmt=".2f", linewidths=.5)
    plt.title(title)
    # Vertical x labels / horizontal y labels keep the feature names readable
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.show()

# Ratio columns selected for the study, followed by 'Year' so that the
# resulting subset can still be filtered per year
selected_features = [
    'grossProfitMargin',
    'operatingProfitMargin',
    'netProfitMargin',
    'returnOnEquity',
    'returnOnAssets',
    'returnOnCapitalEmployed',
    'currentRatio',
    'quickRatio',
    'cashRatio',
    'debtEquityRatio',
    'interestCoverage',
    'debtRatio',
    'inventoryTurnover',
    'daysOfSalesOutstanding',
    'payablesTurnover',
    'assetTurnover',
    'PE ratio',
    'PB ratio',
    'priceSalesRatio',
    'dividendYield',
    'dividendPayoutRatio',
    'priceCashFlowRatio',
    'Free Cash Flow Yield',  
    'EPS Growth',
    'Revenue Growth',
    'Book Value per Share Growth',
    'priceEarningsToGrowthRatio',
    'Enterprise Value over EBITDA',
    'Year'
]
# Column subset of full_df restricted to the selected features
value_features_df = full_df[selected_features]

### Plot the features for year 2014

In [None]:
plot_by_year(value_features_df, 2014)

### Now let's create a heatmap to see how these features are correlated with each other

In [None]:
# Year 2014
# 'Year' is constant inside a single-year slice, so its correlation with every
# other feature is undefined (NaN) and only adds an empty row/column: leave it out.
heatmap_features = [feature for feature in selected_features if feature != 'Year']
plot_full_correlation_heatmap(full_df, heatmap_features, 2014, "Full Correlation Heatmap for the Year 2014")

# Observations:

## High Positive Correlations:

1. **Profit Margins and Returns:**
   - `grossProfitMargin`, `operatingProfitMargin`, and `netProfitMargin` show strong positive correlations with `returnOnEquity`, `returnOnAssets`, and `returnOnCapitalEmployed`.
   - This suggests that higher profit margins are generally associated with higher returns on equity, assets, and capital employed.

2. **Liquidity Ratios:**
   - `currentRatio`, `quickRatio`, and `cashRatio` show positive correlations with each other.
   - This indicates that they tend to move in the same direction as expected since they measure the company's ability to cover its short-term obligations.

## Negative Correlations:

1. **Debt Ratios vs. Returns:**
   - `debtEquityRatio` and `debtRatio` show negative correlations with `returnOnEquity` and `returnOnAssets`.
   - This implies that companies with higher debt tend to have lower returns on equity and assets.

2. **Profitability vs. Growth Metrics:**
   - Some growth metrics like `Revenue Growth` and `EPS Growth` do not necessarily correlate strongly with current profitability ratios.
   - This indicates that high current profits do not automatically translate to high growth rates.

## Weak or Insignificant Correlations:

1. **Valuation and Profitability:**
   - Metrics like `PE ratio`, `PB ratio`, and valuation ratios show relatively weak correlations with profitability and liquidity metrics.
   - This suggests that market valuation ratios do not always reflect the current financial operational performance directly.

2. **Dividend Metrics:**
   - `dividendYield` and `dividendPayoutRatio` do not show strong correlations with many of the profitability or liquidity metrics.
   - This indicates that dividend policies may be more influenced by other factors like management decisions or historical payout ratios rather than current financial health.


## Check missing values

Let's create a new Dataframe to use the original dataset in case we need it later.

In [None]:
# Independent copy: a plain assignment would only create a second name for the
# same object, so later changes to new_df would also alter full_df
new_df = full_df.copy()

let's check null values

In [None]:
initial_missing_values = full_df.isnull().sum()

let's see the percentage of missing values per feature

In [None]:
for column, value in initial_missing_values.items():
    percentage = round((value / len(full_df)) * 100, 2)
    print(f"{column}: {value} ({percentage}%)")

## Calculate the percentage of missing data by percentage of missingness.

### Missing between 0% and 10% (inclusive).

In [None]:
# Calculate the percentage of missing data by percentage of missingness
def percentage_of_nan_classification(min, max, df):
    """Print the columns of ``df`` whose share of missing values lies in [min, max].

    Parameters
    ----------
    min, max : bounds of the percentage band, both inclusive (a column sitting
        exactly on a shared bound is therefore reported by both bands).
    df : DataFrame whose missing values are counted.

    The counts are computed from ``df`` itself, so the function no longer depends
    on a global Series having been created by an earlier cell.
    Prints one line per matching column: ``<column>: <missing count> (<percentage>%)``.
    """
    total_rows = len(df)
    if total_rows == 0:
        # No rows: percentages are undefined, nothing to report
        return

    missing_counts = df.isnull().sum()
    for column, value in missing_counts.items():
        percentage = round((value / total_rows) * 100, 2)
        if min <= percentage <= max:
            print(f"{column}: {value} ({percentage}%)")
percentage_of_nan_classification(0,10,full_df)

### Missing between 10% and 20% (inclusive).

In [None]:
percentage_of_nan_classification(10,20,full_df)

### Missing between 20% and 100% (inclusive).

In [None]:
percentage_of_nan_classification(20,100,full_df)

# Let's use median and KNN imputation and check that the underlying distributions do not change much.

# Data Imputation and Distribution Analysis

This set of functions is designed to facilitate data imputation for datasets with missing values and to assess the impact of imputation on the statistical distribution of each feature within the datasets. The analysis primarily uses the Kolmogorov-Smirnov (KS) test to determine if imputation has significantly altered the distributions, aiding in deciding whether the imputed data is suitable for further analysis or needs additional refinement.

## Function Descriptions and Workflow

### `apply_imputation(data, imputer)`

**Purpose**: Applies specified imputation technique to numeric columns of a dataset and retains non-numeric data unchanged.

**Input**:
- `data`: DataFrame containing the dataset with missing values.
- `imputer`: An instance of `SimpleImputer` or `KNNImputer` from scikit-learn configured with the desired imputation strategy.

**Process**:
- Separates numeric and non-numeric data.
- Applies the imputation to only numeric data.
- Reintegrates imputed numeric data with the original non-numeric data.

**Output**: Returns the imputed dataset along with descriptive statistics (before and after imputation) for each numeric feature.

**Usage**: This function is typically used within a data preprocessing pipeline where handling of missing values is necessary.

### `visualize_and_compare(data_before, data_after, feature, ks_threshold, p_value_threshold)`

**Purpose**: Visualizes the distribution of a feature before and after imputation and performs the KS test to evaluate changes.

**Input**:
- `data_before`: DataFrame containing the feature values before imputation.
- `data_after`: DataFrame containing the feature values after imputation.
- `feature`: The name of the feature to analyze.
- `ks_threshold`: KS statistic threshold to decide significant change.
- `p_value_threshold`: p-value threshold to decide significant change.

**Process**:
- Generates histogram and boxplot for visual comparison.
- Calculates the KS statistic and p-value between the distributions before and after imputation.

**Output**: Returns a boolean indicating if the feature needs review and a text explanation of the KS test results.

**Usage**: Helps in visualizing how individual features are affected by imputation and whether their distribution changes are within acceptable limits.

### `analyze_ks_statistics(data_before, data_after, ks_threshold, p_value_threshold)`

**Purpose**: Applies the KS test across all numeric features to identify those that have significantly changed post-imputation.

**Input**:
- `data_before`, `data_after`: DataFrames of the dataset before and after imputation.
- `ks_threshold`, `p_value_threshold`: Thresholds for the KS statistic and p-value to flag significant changes.

**Process**:
- Iterates through each numeric feature, applying `visualize_and_compare`.

**Output**: Returns a list of the features that need further review based on the KS test results.

**Usage**: Integral for automating the review process of features after imputation, ensuring data integrity.

### `impute_and_analyze(datasets, imputer_type, ks_threshold, p_value_threshold)`

**Purpose**: Coordinates the entire process of imputation and post-imputation analysis for multiple datasets.

**Input**:
- `datasets`: Dictionary of DataFrames keyed by identifiers (e.g., year or dataset name).
- `imputer_type`: Choice between 'median' or 'knn' for the type of imputation.
- `ks_threshold`, `p_value_threshold`: Criteria for significant change post-imputation.

**Process**:
- For each dataset, performs imputation, applies KS analysis, and aggregates results.

**Output**: Returns three dictionaries keyed like `datasets`: the imputed datasets, the list of features that require review for each dataset, and the imputed datasets again under a second name.

**Usage**: Suitable for batch processing of multiple datasets, providing a comprehensive overview of imputation effects and ensuring data quality for subsequent analyses.


In [None]:
from sklearn.impute import SimpleImputer, KNNImputer
from scipy.stats import ks_2samp

def apply_imputation(data, imputer):
    """Impute the numeric columns of ``data`` and re-attach the other columns.

    Parameters
    ----------
    data : DataFrame that may contain missing values.
    imputer : object exposing ``fit_transform(frame)`` that returns one row per
        input row and one column per numeric input column.

    Returns
    -------
    (imputed_df, stats_before, stats_after)
        ``imputed_df`` holds the imputed numeric columns followed by the untouched
        non-numeric columns and keeps the index of ``data``;
        ``stats_before`` / ``stats_after`` are the transposed ``describe()`` tables
        of the numeric data before imputation and of the result after it.

    NOTE(review): an imputer that drops columns (scikit-learn imputers discard
    all-NaN columns by default) will make the DataFrame construction fail here.
    """
    numeric_data = data.select_dtypes(include=[np.number])
    non_numeric_data = data.select_dtypes(exclude=[np.number])
    stats_before = numeric_data.describe().transpose()

    imputed_data = imputer.fit_transform(numeric_data)
    # Reuse the original index: with a default RangeIndex here, the concat below
    # would align on labels and misplace rows whenever ``data`` has another index
    imputed_df = pd.DataFrame(imputed_data, columns=numeric_data.columns, index=numeric_data.index)
    imputed_df = pd.concat([imputed_df, non_numeric_data], axis=1)

    stats_after = imputed_df.describe().transpose()
    return imputed_df, stats_before, stats_after

def visualize_and_compare(data_before, data_after, feature, ks_threshold=0.10, p_value_threshold=0.05):
    """Plot one feature before/after imputation and run a two-sample KS test on it.

    Draws overlaid density histograms (shared bins) and side-by-side boxplots, then
    compares the non-missing original values with the imputed values via ``ks_2samp``.

    Returns
    -------
    (needs_review, explanation)
        ``needs_review`` is true when the KS statistic exceeds ``ks_threshold``
        and the p-value is below ``p_value_threshold``; ``explanation`` is the
        text that is also printed.
    """
    before = data_before[feature]
    after = data_after[feature]
    observed_before = before.dropna()

    plt.figure(figsize=(14, 6))

    # Left panel: overlaid histograms on 30 shared bin edges spanning both series
    plt.subplot(1, 2, 1)
    lowest = min(before.min(), after.min())
    highest = max(before.max(), after.max())
    bins = np.linspace(lowest, highest, 30)
    before.hist(alpha=0.5, bins=bins, label='Before', density=True)
    after.hist(alpha=0.5, bins=bins, label='After', density=True)
    plt.legend()
    plt.title(f'Histogram of {feature}')

    # Right panel: boxplots of the observed original values vs. the imputed values
    plt.subplot(1, 2, 2)
    plt.boxplot([observed_before, after], labels=['Before', 'After'], showfliers=True)
    plt.title(f'Boxplot of {feature}')
    plt.show()

    ks_stat, ks_pvalue = ks_2samp(observed_before, after)
    needs_review = ks_stat > ks_threshold and ks_pvalue < p_value_threshold
    explanation = (f"KS Test for {feature}: Statistic={ks_stat:.4f}, P-value={ks_pvalue:.4f}.\n"
                   f"{'Significant changes detected, review needed.' if needs_review else 'Changes are minimal, imputation appears effective.'}")
    print(explanation)
    return needs_review, explanation

def analyze_ks_statistics(data_before, data_after, ks_threshold=0.10, p_value_threshold=0.05):
    """Run ``visualize_and_compare`` on every numeric feature and collect the flagged ones.

    Returns the list of numeric column names of ``data_before`` (in column order)
    for which ``visualize_and_compare`` reported that a review is needed.
    """
    numeric_features = data_before.select_dtypes(include=[np.number]).columns
    return [
        feature
        for feature in numeric_features
        if visualize_and_compare(data_before, data_after, feature, ks_threshold, p_value_threshold)[0]
    ]

def impute_and_analyze(datasets, imputer_type='median', ks_threshold=0.10, p_value_threshold=0.05):
    """Impute every dataset and flag the features whose distribution changed.

    Parameters
    ----------
    datasets : dict mapping a label (e.g. the year) to a DataFrame.
    imputer_type : 'knn' for ``KNNImputer(n_neighbors=5)``, or one of 'mean',
        'median', 'most_frequent', which is passed to ``SimpleImputer`` as strategy.
    ks_threshold, p_value_threshold : forwarded to ``analyze_ks_statistics``.

    Returns
    -------
    (results, review_lists, imputed_datasets)
        ``results`` and ``imputed_datasets`` both map each label to its imputed
        DataFrame; ``review_lists`` maps each label to the list of flagged features.

    Raises
    ------
    ValueError
        If ``imputer_type`` is not a supported value. Previously any value other
        than 'median' silently fell back to KNN imputation.
    """
    simple_strategies = ('mean', 'median', 'most_frequent')
    if imputer_type != 'knn' and imputer_type not in simple_strategies:
        raise ValueError(
            f"Unsupported imputer_type {imputer_type!r}; expected 'knn' or one of {simple_strategies}"
        )

    results = {}
    review_lists = {}
    imputed_datasets = {}  # To store imputed datasets for each year
    for year, data in datasets.items():
        # A fresh imputer per dataset, so nothing fitted on one year is reused on another
        if imputer_type == 'knn':
            imputer = KNNImputer(n_neighbors=5)
        else:
            imputer = SimpleImputer(strategy=imputer_type)

        imputed_data, _stats_before, _stats_after = apply_imputation(data, imputer)
        results[year] = imputed_data
        imputed_datasets[year] = imputed_data

        # Compare each numeric feature before and after imputation
        review_lists[year] = analyze_ks_statistics(data, imputed_data, ks_threshold, p_value_threshold)

        print(f"Imputation completed for {year}. Features to review due to significant changes: {len(review_lists[year])}")
        if review_lists[year]:
            print("Review required for the following features:")
            for feature in review_lists[year]:
                print(f"{feature}")

    return results, review_lists, imputed_datasets

#### Let's use median and see the results after imputation

In [None]:
# Per-year frames to impute, keyed by year label.
# These are the yearly frames loaded at the top (each already carries the added 'Year' column).
datasets = {
    '2014': data_2014,
    '2015': data_2015,
    '2016': data_2016,
    '2017': data_2017,
    '2018': data_2018,
    #'Full': full_df
}
# Median imputation: returns the imputed frames, the per-year review lists,
# and the imputed frames a second time
df_results_median, review_list, df_imputed_median = impute_and_analyze(datasets, imputer_type='median')

#### Let's use KNN and see the results after imputation

In [None]:
df_results_knn, review_list_knn, df_imputed_knn = impute_and_analyze(datasets, imputer_type='knn')

# Comparison of Median and KNN Imputation Methods

In our analysis, we compared the effectiveness of median and KNN imputation methods by examining the number of features that showed significant changes after imputation. The Kolmogorov-Smirnov (KS) test was used to identify features with significant distribution changes that require further review.

In [None]:
review_list

In [None]:
# Outer double quotes: reusing the outer quote inside a replacement field is a
# SyntaxError on Python versions before 3.12
print(f"Features that need review using median imputation--> 2014: {len(review_list['2014'])}, 2015: {len(review_list['2015'])}, 2016: {len(review_list['2016'])}, 2017: {len(review_list['2017'])}, 2018: {len(review_list['2018'])}")

In [None]:
review_list_knn

In [None]:
# Outer double quotes: reusing the outer quote inside a replacement field is a
# SyntaxError on Python versions before 3.12
print(f"Features that need review--> 2014: {len(review_list_knn['2014'])}, 2015: {len(review_list_knn['2015'])}, 2016: {len(review_list_knn['2016'])}, 2017: {len(review_list_knn['2017'])}, 2018: {len(review_list_knn['2018'])}")

## Features Needing Review

### Median Imputation
The number of features needing review for each year using median imputation are as follows:
- **2014**: 47 features
- **2015**: 37 features
- **2016**: 53 features
- **2017**: 59 features
- **2018**: 27 features

### KNN Imputation
The number of features needing review for each year using KNN imputation are as follows:
- **2014**: 8 features
- **2015**: 4 features
- **2016**: 13 features
- **2017**: 9 features
- **2018**: 6 features

## Conclusion
The KNN imputation method resulted in fewer features needing review due to significant distribution changes compared to the median imputation method. This suggests that KNN imputation may better preserve the original data distributions in this context.

It is also important to note that the features with more than 20% of missing values and the features that need to be reviewed are the same. This makes sense, because the underlying distribution can change dramatically when that much missing data is replaced by predicted values.

### Now let's check out new datasets

In [None]:
# Pull the KNN-imputed frame of every year out of the result dictionary
imputed_data_2014 = df_imputed_knn['2014']
imputed_data_2015 = df_imputed_knn['2015']
imputed_data_2016 = df_imputed_knn['2016']
imputed_data_2017 = df_imputed_knn['2017']
imputed_data_2018 = df_imputed_knn['2018']

# Display the first few rows of the 2014 imputed DataFrame
imputed_data_2014.head()

In [None]:
def drop_features_to_review(imputed_datasets, review_lists):
    """Remove every flagged feature from every imputed dataset.

    A feature flagged for any one key of ``review_lists`` is dropped from all
    datasets; names that a dataset does not contain are ignored.
    Returns a new dict with the same keys; the input frames are not modified.
    """
    # Union of the flagged feature names across all years
    features_to_drop = set().union(*review_lists.values())

    return {
        year: frame.drop(columns=features_to_drop, errors='ignore')
        for year, frame in imputed_datasets.items()
    }

In [None]:
# Per-year KNN-imputed frames, keyed by year label
imputed_dfs = {
    '2014': imputed_data_2014,
    '2015': imputed_data_2015,
    '2016': imputed_data_2016,
    '2017': imputed_data_2017,
    '2018': imputed_data_2018,
}
# Drop from every year any feature that the KNN review lists flagged in at least one year
cleaned_datasets = drop_features_to_review(imputed_dfs, review_list_knn)

In [None]:
# Outer double quotes keep this f-string valid on Python versions before 3.12
print(f"Shape for 2014: {cleaned_datasets['2014'].shape}")

In [None]:
# Outer double quotes keep this f-string valid on Python versions before 3.12
print(f"Shape for 2015: {cleaned_datasets['2015'].shape}")

In [None]:
# Outer double quotes keep this f-string valid on Python versions before 3.12
print(f"Shape for 2016: {cleaned_datasets['2016'].shape}")

In [None]:
# Outer double quotes keep this f-string valid on Python versions before 3.12
print(f"Shape for 2017: {cleaned_datasets['2017'].shape}")

In [None]:
# Outer double quotes keep this f-string valid on Python versions before 3.12
print(f"Shape for 2018: {cleaned_datasets['2018'].shape}")

In [None]:
!pip install pandas_datareader


# Outliers

From the boxplots we generated previously, we observed that there is at least one significant outlier in every feature. Identifying and handling these outliers is crucial for ensuring the quality and accuracy of our data analysis. Let's investigate these outliers and develop a strategy to address them.

We'll focus on identifying stocks with unusual price variations, verifying the organic nature of their gains, and then cleaning the dataset accordingly.


### Explanation:

#### 1. Extract Target Data
This step focuses on isolating the `PRICE VAR [%]` column and the `Sector` column for the specified year. By doing this, we can specifically analyze the price variation data for each stock in that year.

#### 2. Plot Sector Data
To identify potential outliers, we visualize the price variations for each sector. This helps us spot major peaks or valleys that might indicate unusual stock performance.

#### 3. Investigate Top Gainers
For stocks that show significant gains, we fetch their daily prices and trading volumes. By plotting these, we can verify if the growth was organic or if there were periods of no trading activity, which might indicate inorganic growth.

#### 4. Clean Dataset
After identifying stocks with inorganic gains, we remove them from the dataset. This ensures that our analysis and predictions are based on genuinely performing stocks, which is crucial for accurately predicting and potentially outperforming the S&P 500.

In [None]:
from pandas_datareader import data as pdr
import matplotlib.ticker as ticker

def clean_dataset_by_year(df, year, price_var_year, gain_threshold=500):
    """Plot price variations per sector and drop top gainers whose growth cannot be verified.

    Parameters
    ----------
    df : DataFrame with the columns 'Sector', 'Unnamed: 0' (used as the ticker
        symbol) and '<price_var_year> PRICE VAR [%]'.
    year : label of the dataset; not used by the body, kept for the callers.
    price_var_year : year whose PRICE VAR column is inspected and whose calendar
        range (Jan 1 - Dec 31) is downloaded for the top gainers.
    gain_threshold : minimum price variation (in %) for a stock to be investigated.

    Returns
    -------
    (df, inorganic_stocks)
        ``df`` without the rows of the flagged tickers, and the list of flagged tickers.

    A top gainer is flagged when its downloaded adjusted close never changes over
    the period, or when downloading/plotting its prices raises (best effort: the
    error is printed and the stock is treated as unverifiable). Partially flat
    stretches are not detected automatically and must be judged from the plots.
    """
    # Extract the price variation column for the given year
    price_var_col = f'{price_var_year} PRICE VAR [%]'
    df_ = df.loc[:, ['Sector', price_var_col, 'Unnamed: 0']]

    # Plot the percent price variation for each sector
    for sector in df_['Sector'].unique():
        temp = df_[df_['Sector'] == sector]
        plt.figure(figsize=(30,5))
        plt.plot(temp[price_var_col])
        plt.title(sector.upper(), fontsize=20)
        plt.show()

    # Identify stocks that gained at least the threshold, largest gain first
    top_gainers = df_[df_[price_var_col] >= gain_threshold]
    top_gainers = top_gainers.sort_values(by=price_var_col, ascending=False)

    # Set date range for the given year
    date_start = f'{price_var_year}-01-01'
    date_end = f'{price_var_year}-12-31'
    symbols = top_gainers['Unnamed: 0'].values.tolist()

    # Verify organic growth of top gainers
    inorganic_stocks = []
    # The loop variable must not be called `ticker`: that name would shadow the
    # matplotlib.ticker module used for the volume-axis formatter below, making
    # every iteration raise and every top gainer be flagged.
    for symbol in symbols:
        try:
            daily_price = pdr.DataReader(symbol, 'yahoo', date_start, date_end)
            fig, (ax0, ax1) = plt.subplots(2, 1, gridspec_kw={'height_ratios': [3, 1]})
            ax0.plot(daily_price['Adj Close'])
            ax0.set_title(symbol, fontsize=18)
            ax0.set_ylabel('Daily Adj Close $', fontsize=14)
            ax1.plot(daily_price['Volume'])
            ax1.set_ylabel('Volume', fontsize=14)
            ax1.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:.0E}'))
            fig.align_ylabels(ax1)
            fig.tight_layout()
            plt.show()

            # Day-to-day changes; the first diff is always NaN and has to be dropped,
            # otherwise the "all changes are zero" test below can never succeed
            price_changes = daily_price['Adj Close'].diff().dropna()
            if not price_changes.empty and price_changes.eq(0).all():
                inorganic_stocks.append(symbol)
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")
            inorganic_stocks.append(symbol)

    # Remove inorganic gainers from the dataframe
    df = df[~df['Unnamed: 0'].isin(inorganic_stocks)]

    return df, inorganic_stocks

# Cleaned frame per year, and the tickers flagged as inorganic per year
cleaned_datasets_updated = {}
inorganic_stocks_by_year = {}
# Fiscal year -> year of the PRICE VAR column that belongs to it
price_var_years = {
    '2014': '2015',
    '2015': '2016',
    '2016': '2017',
    '2017': '2018',
    '2018': '2019'  
}

for year, df in cleaned_datasets.items():
    cleaned_datasets_updated[year], istocks = clean_dataset_by_year(df, year, price_var_years[year])
    # istocks is overwritten on every iteration (after the loop it only holds the
    # last year's tickers), so keep each year's list as well
    inorganic_stocks_by_year[year] = istocks

# accessing the cleaned data for a specific year with outliers handled
cleaned_data_2014 = cleaned_datasets_updated['2014']
cleaned_data_2015 = cleaned_datasets_updated['2015']
cleaned_data_2016 = cleaned_datasets_updated['2016']
cleaned_data_2017 = cleaned_datasets_updated['2017']
cleaned_data_2018 = cleaned_datasets_updated['2018']

cleaned_data_2015.head()

In [None]:
# Outer double quotes keep the first f-string valid on Python versions before 3.12
print(f"Shape for 2014 before: {imputed_dfs['2014'].shape}")
print(f'Shape for 2014 after: {cleaned_data_2014.shape}')

In [None]:
# Outer double quotes keep the first f-string valid on Python versions before 3.12
print(f"Shape for 2015 before: {imputed_dfs['2015'].shape}")
print(f'Shape for 2015 after: {cleaned_data_2015.shape}')

In [None]:
# Outer double quotes keep the first f-string valid on Python versions before 3.12
print(f"Shape for 2016 before: {imputed_dfs['2016'].shape}")
print(f'Shape for 2016 after: {cleaned_data_2016.shape}')

In [None]:
# Outer double quotes keep the first f-string valid on Python versions before 3.12
print(f"Shape for 2017 before: {imputed_dfs['2017'].shape}")
print(f'Shape for 2017 after: {cleaned_data_2017.shape}')

In [None]:
# Outer double quotes keep the first f-string valid on Python versions before 3.12
print(f"Shape for 2018 before: {imputed_dfs['2018'].shape}")
print(f'Shape for 2018 after: {cleaned_data_2018.shape}')

In [None]:
istocks

### Outlier Management in Data Analysis

During our comprehensive review of the annual financial performance data, we have identified specific instances where stocks exhibited extreme price variations that deviate significantly from typical sector trends. These anomalies, or 'outliers,' could potentially skew our analysis and lead to less accurate forecasts.

#### Decision to Remove Outliers

To enhance the robustness and reliability of our predictive models, we have decided to remove these outliers from our dataset. This step ensures that our analysis is grounded in data that accurately reflects the majority of market behaviors without being disproportionately influenced by extreme values.

By focusing on more consistent data, we aim to provide more reliable insights and strategies for outperforming the S&P 500. This approach is crucial for maintaining the integrity of our financial models and supporting strategic investment decisions based on sound statistical principles.


In [None]:
def remove_quantile_outliers(cleaned_datasets, lower_quantile=0.03, upper_quantile=0.97):
    """
    Caps and floors the numeric values of each dataset at the given quantiles.

    Despite the name, no rows are removed: values above the upper quantile of their
    column are replaced by that quantile, values below the lower quantile by the
    lower one. Non-numeric columns are left untouched and are placed after the
    numeric columns in the result.

    Parameters:
    cleaned_datasets (dict): A dictionary where keys are years and values are DataFrames of the datasets for those years.
    lower_quantile (float): Quantile used as the floor (default 0.03, the 3rd percentile).
    upper_quantile (float): Quantile used as the cap (default 0.97, the 97th percentile).

    Returns:
    dict: A dictionary of DataFrames with outliers adjusted.

    Raises:
    ValueError: If the quantiles do not satisfy 0 <= lower_quantile <= upper_quantile <= 1.
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError(
            f"Expected 0 <= lower_quantile <= upper_quantile <= 1, got {lower_quantile} and {upper_quantile}"
        )

    cleaned_datasets_updated = {}

    for year, df in cleaned_datasets.items():
        # Separate numeric and non-numeric data
        numeric_data = df.select_dtypes(include=[np.number])
        non_numeric_data = df.select_dtypes(exclude=[np.number])

        # Per-column thresholds and outlier masks, all computed on the unmodified values
        top_quantiles = numeric_data.quantile(upper_quantile)
        low_quantiles = numeric_data.quantile(lower_quantile)
        outliers_top = (numeric_data > top_quantiles)
        outliers_low = (numeric_data < low_quantiles)

        # axis=1 aligns each threshold with its column
        numeric_data = numeric_data.mask(outliers_top, top_quantiles, axis=1)
        numeric_data = numeric_data.mask(outliers_low, low_quantiles, axis=1)

        # Concatenate the numeric and non-numeric data back together
        cleaned_data = pd.concat([numeric_data, non_numeric_data], axis=1)
        cleaned_datasets_updated[year] = cleaned_data

        print(f"Data for {year} after outlier adjustment:")
        print(cleaned_data.describe())

    return cleaned_datasets_updated

# Example usage with your cleaned datasets
cleaned_datasets_with_adjusted_outliers = remove_quantile_outliers(cleaned_datasets_updated)

In [None]:
cleaned_datasets_with_adjusted_outliers['2014']

## Now let's plot again to see the distributions and outliers after cleaning

## 2014

In [None]:
plot_by_year(cleaned_datasets_with_adjusted_outliers['2014'], 2014)

## 2015

In [None]:
plot_by_year(cleaned_datasets_with_adjusted_outliers['2015'], 2015)

## 2016

In [None]:
plot_by_year(cleaned_datasets_with_adjusted_outliers['2016'], 2016)

## 2017

In [None]:
plot_by_year(cleaned_datasets_with_adjusted_outliers['2017'], 2017)

## 2018

In [None]:
plot_by_year(cleaned_datasets_with_adjusted_outliers['2018'], 2018)

### We still have outliers, but it looks much better. Let's save the new cleaned datasets to start with feature engineering and modeling in the next notebook.

In [None]:
def save_cleaned_datasets(cleaned_datasets_updated, directory="processed_data"):
    """
    Saves the cleaned datasets to the specified directory with names corresponding to their respective years.

    Each DataFrame is written without its index to ``<directory>/clean_df_<year>.csv``.
    A column named 'Unnamed: 0' is written as 'Symbol'; the DataFrames passed in are
    not modified.

    Parameters:
    cleaned_datasets_updated (dict): A dictionary where keys are years and values are DataFrames of the cleaned datasets for those years.
    directory (str): Path where files will be saved; created (with parents) if missing.
    """
    # Requires the `os` module, which must be imported in the notebook's import cell
    os.makedirs(directory, exist_ok=True)

    for year, df in cleaned_datasets_updated.items():
        # rename() returns a new frame, so the caller's DataFrame keeps its column name
        if 'Unnamed: 0' in df.columns:
            df = df.rename(columns={'Unnamed: 0': 'Symbol'})

        filename = f"clean_df_{year}.csv"
        file_path = os.path.join(directory, filename)

        df.to_csv(file_path, index=False)

        print(f"Saved cleaned dataset for {year} to {file_path}")

# Example usage assuming cleaned_datasets_updated is defined
save_cleaned_datasets(cleaned_datasets_with_adjusted_outliers)


In [None]:
new_df = pd.read_csv('processed_data/clean_df_2014.csv')