In [1]:
import pandas as pd 
import numpy as np

import plotly.graph_objects as go
import os
import matplotlib.pyplot as plt

from tqdm import tqdm

# Cap how many columns and rows pandas renders when displaying a frame.
pd.options.display.max_columns = 50
pd.options.display.max_rows = 50

In [2]:
def read_table(table_path, file="", skiprows=28, n_leading_cols=8, n_footer_rows=49,
               section_headers=('BALANCE SHEET', 'CASH FLOW STATEMENT',
                                'CHANGES TO SHAREHOLDER EQUITY', 'Ratios')):
    """Load one Excel report and return it with one row per reported column.

    The sheet is read after skipping ``skiprows`` rows, the first
    ``n_leading_cols`` columns and the last ``n_footer_rows`` rows are
    discarded, the ``'INCOME STATEMENT'`` column becomes the index and the
    table is transposed, so the labels from that column become the columns
    of the result.

    Args:
        table_path: Path of the Excel file, passed to ``pd.read_excel``.
        file: Value stored in the ``'Company_name'`` column of the result.
        skiprows: Number of rows skipped at the top of the sheet.
        n_leading_cols: Number of left-most columns to discard.
        n_footer_rows: Number of trailing rows to discard.
        section_headers: Labels removed from the transposed table's columns.

    Returns:
        The transposed DataFrame with an added ``'Company_name'`` column.

    Raises:
        KeyError: If the sheet has no ``'INCOME STATEMENT'`` column or any of
            ``section_headers`` is missing from the transposed columns.
    """
    raw = pd.read_excel(table_path, skiprows=skiprows)

    # Clamp at zero so a footer longer than the sheet yields an empty table
    # instead of a negative slice bound that would keep some leading rows.
    last_row = max(len(raw) - n_footer_rows, 0)
    body = raw.iloc[:last_row, n_leading_cols:]

    table = body.set_index('INCOME STATEMENT').transpose()
    table = table.drop(columns=list(section_headers))
    table['Company_name'] = file
    return table

In [3]:
train_dir = 'data/oil_gas_train/'

# os.listdir returns entries in arbitrary order; sort them so the row order of
# the combined frame is the same on every run.
train_files = sorted(os.listdir(train_dir))
df = pd.concat([read_table(os.path.join(train_dir, file), file) for file in tqdm(train_files)])

# Remember the company labels and the name their column gets after the rename
# below, so the column is addressed by name rather than by position.
company_names = df['Company_name'].to_numpy()
company_col = 'Company_name_{}'.format(list(df.columns).index('Company_name'))

# Suffix each column with its position, which makes every column name unique.
df.columns = ['{}_{}'.format(col, i) for i, col in enumerate(df.columns)]
df.index.names = ['Date']

# Keep only columns holding more than one distinct value (NaN counts as a value).
df = df[df.columns[df.nunique(dropna=False) > 1]]

# Coerce only the feature columns to numbers. Coercing the company column too
# would turn it into all-NaN floats, and writing the strings back into that
# float column relies on silent dtype upcasting.
feature_cols = df.columns.drop(company_col, errors='ignore')
df = df[feature_cols].apply(pd.to_numeric, errors='coerce')
df[company_col] = company_names
df.shape

100%|██████████| 54/54 [00:06<00:00,  7.74it/s]


(2056, 282)

In [4]:
# Total number of missing cells: per-column NaN counts, then their sum.
df.isnull().sum(axis=0).sum()

11442

In [5]:
def bar_plot(df, y_title: str = '', x_title: str = 'company_name'):
    """Show a Plotly bar chart built from ``df.index`` and ``df.values``.

    The x-axis categories are ordered by descending total.

    Args:
        df: Object exposing ``.index`` (bar positions) and ``.values`` (bar heights).
        y_title: Title of the y axis.
        x_title: Title of the x axis.
    """
    bars = go.Bar(x=df.index, y=df.values)
    x_axis = {'title': x_title, 'categoryorder': 'total descending'}
    y_axis = {'title': y_title}

    fig = go.Figure(bars)
    fig.update_layout(barmode='stack', yaxis=y_axis, xaxis=x_axis)
    fig.show()

In [6]:
# Per-feature extremes over everything except the company-name column.
# The results are deliberately NOT called `min` / `max`: rebinding those names
# would shadow the Python built-ins for the rest of the kernel session.
features = df.drop(columns=['Company_name_329'])
feature_min = features.min()
feature_max = features.max()

# Shorten the labels to their first 8 characters for the x axis.
feature_min.index = [name[:8] for name in feature_min.index]
feature_max.index = [name[:8] for name in feature_max.index]

bar_plot(feature_min, 'min value', 'feature')
bar_plot(feature_max, 'max value', 'feature')


In [7]:
# `df` still contains the string company-name column, so restrict the mean to
# numeric columns; without this, pandas >= 2.0 raises TypeError on the strings.
mean = df.mean(numeric_only=True)
mean.index = [name[:8] for name in mean.index]
bar_plot(mean, 'mean', 'feature')

# Number of missing values per column, labels shortened to 8 characters.
na = df.isna().sum()
na.index = [name[:8] for name in na.index]
bar_plot(na, 'na_sum', 'feature')





In [8]:
# Average of the Revenue_0 column for each company, one bar per company.
revenue_by_company = df.groupby(['Company_name_329'])['Revenue_0']
mean_rev_by_comp = revenue_by_company.mean()
bar_plot(mean_rev_by_comp, y_title='mean rev')


In [9]:
# Group once, then derive each per-company statistic from the same grouping.
rev_by_comp = df.groupby(['Company_name_329'])['Revenue_0']
min_rev_by_comp = rev_by_comp.min()
max_rev_by_comp = rev_by_comp.max()
sum_rev_by_comp = rev_by_comp.sum()

# Plots appear in this order: minimum, maximum, their spread, then the total.
for rev_series, rev_label in (
    (min_rev_by_comp, 'min rev'),
    (max_rev_by_comp, 'max rev'),
    (max_rev_by_comp - min_rev_by_comp, 'max-min rev'),
    (sum_rev_by_comp, 'sum rev'),
):
    bar_plot(rev_series, rev_label)


In [36]:
import plotly.figure_factory as ff
import numpy as np
# Seed NumPy's global random generator.
np.random.seed(1)

# Values of the Revenue_0 column as a NumPy array.
x = df['Revenue_0'].to_numpy()



In [37]:
# Display the Revenue_0 column.
df.loc[:, 'Revenue_0']


Date
2011-03-31      0.000000
2011-06-30      0.000000
2011-09-30      0.000000
2011-12-31     61.863000
2012-03-31     48.410000
                 ...    
2019-06-30    236.260000
2019-09-30    176.942000
2019-12-31    230.844000
2020-03-31    165.187008
2020-06-30     63.129000
Name: Revenue_0, Length: 2056, dtype: float64

In [19]:
# Mean of the Revenue_0 column over all rows.
df.loc[:, 'Revenue_0'].mean()

776.7134953769455

In [13]:
# Largest value in the Revenue_0 column.
df.loc[:, 'Revenue_0'].max()

54174.998528

In [None]:
import plotly.express as px

# Plotly's bundled "tips" demo dataset. It gets its own name so that it does
# not overwrite `df`, which the cells above rely on.
tips = px.data.tips()
fig = px.histogram(tips, x="total_bill", y="tip", color="sex", marginal="rug",
                   hover_data=tips.columns)
fig.show()
