# Industries

In [21]:
#%pip install --upgrade numpy
#%pip install --upgrade pandas
#%pip install --upgrade sklearn
#%pip install --upgrade scipy

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [23]:
ADR_FEATURES_FILE = './data/adr_features.csv'
ADR_LABELS_FILE = './data/adr_labels.csv'

US_FEATURES_FILE = './data/us_features.csv'
US_LABELS_FILE = './data/us_labels.csv'

DATA_FILE = './data/us_industries.csv'

## Load features and labels

In [24]:
# -- US --
# Load features
us_features = pd.read_csv(US_FEATURES_FILE, index_col='ticker')
# Load labels (Zack's ranks)
us_labels = pd.read_csv(US_LABELS_FILE, index_col='ticker')
us_df = pd.concat([us_features, us_labels], axis=1)
print(len(us_features), len(us_labels), us_df.shape)

7146 7146 (7146, 32)


In [25]:
# -- ADR --
# Load features
adr_features = pd.read_csv(ADR_FEATURES_FILE, index_col='ticker')
# Load labels (Zack's ranks)
adr_labels = pd.read_csv(ADR_LABELS_FILE, index_col='ticker')
adr_df = pd.concat([adr_features, adr_labels], axis=1)
print(len(adr_features), len(adr_labels), adr_df.shape)

463 463 (463, 31)


In [26]:
df = pd.concat([us_df, adr_df], axis=0)
print(us_df.shape)
print(adr_df.shape)
print(df.shape)

(7146, 32)
(463, 31)
(7609, 33)


## Duplicates

In [27]:
# identify rows with duplicate indices
duplicated_indices = df.index.duplicated()
# remove rows with duplicate indices
df = df[~duplicated_indices]
df.shape

(7209, 33)

In [28]:
df.describe()

Unnamed: 0,recommendationMean,numberOfAnalystOpinions,overallRisk,beta,previousClose,trailingEps,forwardEps,trailingPE,forwardPE,pegRatio,...,fiveYearAvgDividendYield,enterpriseValue,freeCashflow,debtToEquity,returnOnAssets,returnOnEquity,quickRatio,currentRatio,zacks_rank,numberOfAnalystOpinionsoverallRisk
count,4752.0,4477.0,3491.0,4903.0,7124.0,6353.0,4811.0,3257.0,4817.0,3063.0,...,1906.0,6271.0,4605.0,4427.0,5379.0,4977.0,5192.0,5197.0,4112.0,0.0
mean,2.227504,7.211749,5.446863,1.036202,35.050753,21.294455,2.381785,inf,inf,38.260509,...,4.307319,27730580000.0,-58649390.0,373.357134,-0.039673,-0.160966,3.259989,4.086141,2.994407,
std,0.538052,6.656067,2.894461,41.276255,126.4562,1485.740776,10.360006,,,2422.023937,...,3.388737,1467883000000.0,53328590000.0,6991.958748,0.300651,8.419726,8.760655,9.363906,0.841468,
min,1.0,1.0,1.0,-2483.194,0.0023,-15245.86,-56.2,4.9e-05,-6293.5005,-64450.0,...,0.02,-46887780000000.0,-2791764000000.0,0.003,-6.98568,-385.10544,0.0,0.001,1.0,
25%,2.0,3.0,3.0,0.677693,2.63875,-1.06,-0.24,8.905172,-1.258064,-0.03,...,1.8225,92692080.0,-18763350.0,14.593,-0.05559,-0.2409,0.592,1.039,3.0,
50%,2.1,5.0,5.0,1.049186,10.65,0.02,0.81,17.330477,8.875969,0.95,...,3.33,637608000.0,7802250.0,53.82,0.0125,0.02802,1.191,1.811,3.0,
75%,2.6,10.0,8.0,1.45445,28.12,1.86,3.165,35.8,16.728971,2.22,...,5.68,4257815000.0,171817500.0,120.583,0.04618,0.13772,2.7,3.632,3.0,
max,5.0,49.0,10.0,1041.709,6190.99,110823.52,455.02,inf,inf,110145.0,...,23.94,52852890000000.0,1490633000000.0,278380.0,7.27334,201.52527,290.468,290.468,5.0,


In [30]:
#col = 'recommendationMean'
col = 'zacks_rank'
sub_df = df[['industry', col]]
sub_df.dropna(inplace=True)
sub_df.groupby(by=['industry'])[col].mean().sort_values()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df.dropna(inplace=True)


industry
Real Estate—Development     1.666667
Residential Construction    1.850000
Insurance—Reinsurance       2.000000
Publishing                  2.000000
Building Materials          2.181818
                              ...   
REIT—Hotel & Motel          3.625000
Agricultural Inputs         3.727273
Paper & Paper Products      4.000000
Thermal Coal                4.000000
Food Distribution           4.000000
Name: zacks_rank, Length: 144, dtype: float64