In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
import yfinance as yf
# from project_awesome.data.preprocessing import preprocessing_the_data

In [6]:
#Imports
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector
from sklearn.compose import make_column_transformer

#CLEANING THE DATA:

#drop columns that have more than 30% of missing data:
def remove_shitty_columns(df, threshold=0.3):
    """Return a copy of ``df`` without the columns that are mostly missing.

    A column is removed when its fraction of null values is strictly greater
    than ``threshold``. The frame passed in is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    threshold : float, default 0.3
        Largest tolerated fraction of nulls in a column.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the retained columns, in their original order.
    """
    # Per-column share of nulls; this is NaN when the frame has no rows.
    missing_fraction = df.isnull().mean()
    # Negating ">" (rather than testing "<=") keeps columns whose share is NaN,
    # so a frame without rows passes through with all of its columns.
    keep = ~(missing_fraction > threshold)
    # A positional boolean mask also copes with duplicated column labels.
    return df.loc[:, keep.to_numpy()].copy()


#drop rows that have more than 30% of missing values
def remove_shitty_rows(df, threshold=0.3):
    """Return a copy of ``df`` without the rows that are mostly missing.

    A row is removed when its fraction of null values is strictly greater
    than ``threshold``. The comparison is done on the fraction itself, so no
    integer truncation can let a row above the threshold slip through
    (e.g. 2 missing values out of 5 columns is 40% and is dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    threshold : float, default 0.3
        Largest tolerated fraction of nulls in a row.

    Returns
    -------
    pandas.DataFrame
        New frame with the retained rows; the original index labels are kept.
    """
    # Per-row share of nulls; this is NaN when the frame has no columns.
    missing_fraction = df.isnull().mean(axis=1)
    # Negating ">" keeps rows whose share is NaN (frame without columns).
    keep = ~(missing_fraction > threshold)
    return df.loc[keep.to_numpy()].copy()

#FINAL CLEANING FUNCTION
def clean_dataframe(df):
    """Run the complete cleaning sequence on ``df`` and return the result.

    The column filter is applied first, then the row filter, and finally the
    index is rebuilt as a fresh 0..n-1 range (the previous index is discarded).
    """
    return (
        df.pipe(remove_shitty_columns)
        .pipe(remove_shitty_rows)
        .reset_index(drop=True)
    )


#PREPROCESSING
def preprocessing_the_data(df):
    """Impute and robust-scale the non-object columns of ``df``, then re-attach ``Ticker``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain a ``Ticker`` column; every column whose
        dtype is not ``object`` is sent through the numeric pipeline.

    Returns
    -------
    pandas.DataFrame
        Frame with a default 0..n-1 index, one transformed column per selected
        input column (named by the column transformer, e.g. ``pipeline__<name>``)
        and the original ``Ticker`` values as the last column.
    """
    # Missing values are replaced by the constant 0 before scaling.
    imputer = SimpleImputer(strategy="constant", fill_value=0)

    # Robust scaling is used because the data contains outliers.
    rb_scaler = RobustScaler()

    # Numeric branch: impute, then scale, applied to every non-object column.
    num_transformer = make_pipeline(imputer, rb_scaler)
    num_columns = make_column_selector(dtype_exclude="object")

    # Columns not picked by the selector are dropped by the transformer.
    preproc_basic = make_column_transformer((num_transformer, num_columns))

    # Fit on, and transform, the very same frame.
    preprocessed_data = preproc_basic.fit_transform(df)

    # Wrap the transformed array back into a labelled frame.
    df_preproc = pd.DataFrame(
        preprocessed_data,
        columns=preproc_basic.get_feature_names_out(),
    )

    # Attach the tickers by position. The transformed rows are in the same
    # order as ``df``, but ``df_preproc`` carries a fresh RangeIndex, so an
    # index-based join would misalign (or yield NaN) whenever ``df`` does not
    # itself have a 0..n-1 index.
    df_preproc["Ticker"] = df.Ticker.to_numpy()

    return df_preproc


"""to import in the other file:

from preprocessing import clean_dataframe, preprocessing_the_data

df = clean_dataframe(df)
df = preprocessing_the_data(df)

"""


'to import in the other file:\n\nfrom preprocessing import clean_dataframe, preprocessing_the_data\n\ndf = clean_dataframe(df)\ndf = preprocessing_the_data(df)\n\n'

In [9]:
# Load the raw dataset from a local CSV file.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df  = pd.read_csv('/home/jorgeluisgg/code/jorgeluisgg/project_awesome/raw_data/Raw_data_number_one.csv')

In [10]:
# Display the frame as loaded, before any cleaning.
df

Unnamed: 0,Ticker,2023 Treasury Shares Number,2023 Ordinary Shares Number,2023 Share Issued,2023 Net Debt,2023 Total Debt,2023 Tangible Book Value,2023 Invested Capital,2023 Working Capital,2023 Net Tangible Assets,...,2024 Interest Paid Cff,2024 Interest Received Cfi,2024 Taxes Refund Paid,2024 Earnings From Equity Interest Net Of Tax,2024 Rent Expense Supplemental,2024 Rent And Landing Fees,2019 Depletion Income Statement,2024 Financial Assets,2024 Cash From Discontinued Financing Activities,2024 Accrued Interest Receivable
0,AAL,0.0,6.542732e+08,6.542732e+08,3.232400e+10,4.066300e+10,-1.134400e+10,2.770000e+10,-8.490000e+09,-1.134400e+10,...,,,,,,,,,,
1,AAME,,,,,,,,,,...,,,,,,,,,,
2,AAOI,,3.814800e+07,3.814800e+07,6.983200e+07,1.210730e+08,2.062110e+08,3.300670e+08,7.924700e+07,2.062110e+08,...,,,,,,,,,,
3,AAON,,8.150838e+07,8.150838e+07,3.804100e+07,5.055000e+07,5.852790e+08,7.735520e+08,2.822050e+08,5.852790e+08,...,,,,,,,,,,
4,AAPL,0.0,1.555006e+10,1.555006e+10,8.112300e+10,1.110880e+11,6.214600e+10,1.732340e+11,-1.742000e+09,6.214600e+10,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1763,ZKIN,,,,,,,,,,...,,,,,,,,,,
1764,ZLAB,4912200.0,9.722391e+08,9.771513e+08,,1.515100e+07,7.796600e+08,7.961180e+08,7.365390e+08,7.796600e+08,...,,,,,,,,,,
1765,ZS,,1.471690e+08,1.471690e+08,,1.210545e+09,6.100610e+08,1.859271e+09,1.354446e+09,6.100610e+08,...,,,,,,,,,,
1766,ZUMZ,,1.948900e+07,1.948900e+07,,2.542950e+08,3.363190e+08,4.073280e+08,1.943690e+08,3.363190e+08,...,,,,,,,,,,409000.0


In [11]:
# Clean first (column filter, row filter, index reset), then impute and scale.
# Note: `df` is rebound on both lines, so re-running this cell operates on the
# already-processed frame rather than on the data loaded from the CSV.
df = clean_dataframe(df)
df = preprocessing_the_data(df)

In [12]:
# Display the cleaned and preprocessed frame.
df

Unnamed: 0,pipeline__2023 Ordinary Shares Number,pipeline__2023 Share Issued,pipeline__2023 Total Debt,pipeline__2023 Tangible Book Value,pipeline__2023 Invested Capital,pipeline__2023 Working Capital,pipeline__2023 Net Tangible Assets,pipeline__2023 Capital Lease Obligations,pipeline__2023 Common Stock Equity,pipeline__2023 Total Capitalization,...,pipeline__2022 Change In Payable,pipeline__2022 Change In Account Payable,pipeline__2022 Changes In Account Receivables,pipeline__2021 Change In Payable,pipeline__2021 Change In Account Payable,pipeline__2021 Changes In Account Receivables,pipeline__2020 Change In Payable,pipeline__2020 Change In Account Payable,pipeline__2020 Changes In Account Receivables,Ticker
0,8.004987,7.515304,55.734768,-23.148653,15.145524,-27.907990,-23.064565,139.337965,-5.430906,13.420363,...,-0.012340,-0.008001,0.046615,-0.026416,-0.013996,0.022523,0.000000,0.000000,0.000000,AAL
1,0.018042,-0.022246,-0.007525,0.157374,-0.075072,0.088309,0.156803,-0.045400,-0.056036,-0.088785,...,1.987660,2.297475,-0.930844,0.723637,0.877317,-0.650278,-0.959839,-1.144274,-1.871333,AAOI
2,0.580131,0.508216,-0.104489,0.922258,0.171553,0.751386,0.918908,0.068675,0.460284,0.185325,...,1.772945,2.049964,-4.270337,1.129854,1.360232,-0.910677,-0.618187,-0.736752,4.413111,AAON
3,201.101827,189.747335,152.564180,125.139545,96.077914,-5.861826,124.684969,-0.150992,61.394917,89.189982,...,1457.337652,1679.935100,-139.721843,1307.633248,1554.140588,-970.362756,-1165.232358,-1388.717949,1537.111111,AAPL
4,0.418674,0.398206,0.705827,4.428940,2.002865,-0.170596,4.412852,-0.150992,3.130939,2.056340,...,1.745180,1.097440,-1.496358,0.592086,-0.162149,1.875791,-5.191624,-2.063590,-5.309333,ABCB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1644,-0.476477,-0.488941,-0.173992,-0.258718,-0.258624,-0.170596,-0.257779,-0.150992,-0.269239,-0.254178,...,1.345662,1.557419,-0.877990,-0.953617,-1.106238,0.578844,1.558635,1.908645,-1.066864,ZKIN
1645,12.126837,11.465329,-0.153160,1.314480,0.184102,2.235725,1.309705,0.121318,0.520706,0.198146,...,-8.306802,-9.569346,0.378594,6.712603,7.995335,-4.033353,11.469019,13.668718,-0.305556,ZLAB
1646,1.431302,1.311495,1.490422,0.972263,0.775329,4.254469,0.968732,1.221899,0.450251,0.802190,...,2.202375,2.544986,-10.942881,0.764057,0.925482,-10.673759,0.247275,0.294701,-12.049333,ZS
1647,-0.223838,-0.250516,0.175646,0.419906,-0.032106,0.464420,0.418381,4.419469,0.134931,-0.022750,...,-2.967916,-2.508001,0.046615,2.349883,2.608246,0.022523,0.000000,0.000000,0.000000,ZUMZ


In [16]:
# Read the screener export; its first column is used as the list of symbols.
names = pd.read_csv('/home/jorgeluisgg/code/jorgeluisgg/project_awesome/raw_data/nasdaq_screener.csv')
tickers_list = names.iloc[:, 0].tolist()
# Stringify every symbol and skip those that contain a caret.
tickers = [symbol for symbol in map(str, tickers_list) if "^" not in symbol]
# Build one yfinance Ticker object per remaining symbol.
stocks = []
for ticker in tickers:
    stocks.append(yf.Ticker(ticker))

In [26]:
# Load the price data from a local CSV file and display it.
# NOTE(review): presumably one year of stock prices, going by the file name — confirm.
prices = pd.read_csv('/home/jorgeluisgg/code/jorgeluisgg/project_awesome/raw_data/stock_prices_1y.csv')
prices