# Setting Everything Up

In [1]:
import pandas as pd

In [2]:
# I can give a number or use None to remove maximum ceiling & display all columns
pd.options.display.max_columns = None

# # I want to be able to see the entire narrative, so remove the maximum width for each column
# pd.options.display.max_colwidth = None

# pd.options.display.float_format = '{:,.0f}'.format

%matplotlib inline

In [3]:
%load_ext rpy2.ipython
%load_ext autoreload
%autoreload 2

%matplotlib inline  
from matplotlib import rcParams
rcParams['figure.figsize'] = (8, 8)

import warnings
from rpy2.rinterface import RRuntimeWarning
warnings.filterwarnings("ignore") # Ignore all warnings
# warnings.filterwarnings("ignore", category=RRuntimeWarning) # Show some warnings

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML

In [4]:
%%javascript
// Disable auto-scrolling
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [5]:
%%R

# My commonly used R imports

require('tidyverse')

R[write to console]: Loading required package: tidyverse



── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.2     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.2     ✔ tibble    3.2.1
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [6]:
%%R

library(jpeg)
library(RColorBrewer)
library(dplyr)


In [7]:
%%R
library(plotly)
library(ggrepel)
#for timeline plotting

R[write to console]: 
Attaching package: ‘plotly’


R[write to console]: The following object is masked from ‘package:ggplot2’:

    last_plot


R[write to console]: The following object is masked from ‘package:stats’:

    filter


R[write to console]: The following object is masked from ‘package:graphics’:

    layout




The question I am interested in is:

# How are different variables associated with tax rate outcomes

### Background for how I got to this moment in the analysis:
* Mostly used 10K documents from [the SEC website](https://www.sec.gov/) <br />
* If you would like to see the basic exploratory plotting of the resulting data and some preliminary observations from the data, check the notebook in this repository titled: [sec_t_test_analysis](https://github.com/ivynyayieka/education_outcomes/blob/main/education_basic_plotting.ipynb) <br />


# Reading the data

In [9]:
df_stock_top_us_plus_sec_titles_with_links_after_copy_paste=pd.read_csv('df_stock_top_us_plus_sec_titles_merge_df_all_ten_k_document_lists_of_dicts.csv')
df_stock_top_us_plus_sec_titles_with_links_after_copy_paste

Unnamed: 0.8,Unnamed: 0.7,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,to_freeze_name,market_cap_rank,Unnamed: 2,Unnamed: 0,name,american_progress_list,market_cap,price,cik_str,ticker,title_searchable,title_confirmed,just_tech_or_non_tech,tech_or_non_tech,ten_k_document_link,actual_full_ten_k_document_link,most_recent_tax_benefits_from_stock_based_compensation_over_net_income,most_recent_tax_benefits_from_stock_based_compensation,tax_benefits_from_stock_based_compensation_2023,tax_benefits_from_stock_based_compensation_2022,tax_benefits_from_stock_based_compensation_2021,tax_benefits_from_stock_based_compensation_2020,tax_benefits_note,most_recent_stock_or_share_based_compensation,most_recent_stock_or_share_based_compensation_expounded,stock_or_share_based_compensation_end_of_2023,stock_or_share_based_compensation_end_of_2022,stock_or_share_based_compensation_end_of_2021,stock_or_share_based_compensation_end_of_2020,stock_notes,most_recent_tax_rates_low_or_above_ten,most_recent_tax_rates_low_or_above_ten_expounded,most_recent_tax_rates,most_recent_tax_rates_before_google,most_recent_tax_rates_expounded,tax_rates_2023,tax_rates_2022,tax_rates_2021,tax_rates_2020,most_recent_net_income,most_recent_net_income_expounded,net_income_2023,net_income_2022,net_income_2021,net_income_2020,stock_based_compensation_as_a_portion_of_net_income,python_calc_stock_based_compensation_as_a_portion_of_net_income_pct,python_calc_most_recent_tax_benefits_from_stock_based_compensation_over_net_income_pct,python_calc_stock_based_compensation_as_a_portion_of_net_income_not_pct,actual_full_ten_k_document_link_collection,table_length,list_of_stock_based_values,first_stock_based_compensation,second_stock_based_compensation,third_stock_based_compensation
0,0,0,0,0,0,0,0,Apple,1,0,0,Apple,absent,$3.030 T,$191.33,320193,AAPL,Apple Inc.,apple inc. (aapl),tech,tech,https://www.sec.gov//Archives/edgar/data/32019...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,-4.0,-4002.0,,-4002,-4056,-2476,,9280.0,9280,,9280,8108,6975,,above_ten,above_ten,16.2,16.2,16.2,,16.2,13.3,14,99803.0,99803,,99803,94680,57411,9,9.298318,-4.009900,0.092983,https://www.sec.gov//ix?doc=/Archives/edgar/da...,42.0,"['Share-based compensation expense', '9,038', ...",9038,7906,6829
1,1,1,1,1,1,1,1,Microsoft,2,1,1,Microsoft,present,$2.533 T,$338.15,789019,MSFT,MICROSOFT CORP,,tech,tech,https://www.sec.gov//Archives/edgar/data/78901...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,2.0,1293.0,,1293,1065,938,,7502.0,7502,,7502,6118,5289,,above_ten,above_ten,13.1,13.1,13.1,,13.1,13.8,17,72738.0,72738,,72738,61271,44281,10,10.313729,1.777613,0.103137,,,,,,
2,2,2,2,2,2,2,2,Alphabet (Google),3,2,2,Alphabet (Google),absent,$1.553 T,$122.63,1652044,GOOG,Alphabet Inc.,,tech,tech,https://www.sec.gov//Archives/edgar/data/16520...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,5.0,2700.0,,2700,3100,3900,,19362.0,19362,,19362,15376,12991,,above_ten,above_ten,15.9,15.9,15.9,,15.9,16.2,,59972.0,59972,,59972,76033,,32,32.285066,4.502101,0.322851,https://www.sec.gov//ix?doc=/Archives/edgar/da...,50.0,"['Stock-based compensation expense', '12,991',...",12991,15376,19362
3,3,3,3,3,3,3,3,Amazon,4,3,3,Amazon,present,$1.337 T,$130.38,1018724,AMZN,AMAZON COM INC,,tech,tech,https://www.sec.gov//Archives/edgar/data/10187...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,9.0,1900.0,,1900,2700,4300,,19621.0,19621,,19621,12757,9208,,,,6.0,6.0,unclear,,unclear,unclear,unclear,21331.0,21331,,21331,33364,-2722,92,91.983498,8.907224,0.919835,https://www.sec.gov//ix?doc=/Archives/edgar/da...,38.0,"['Stock-based compensation', '9,208', '', '', ...",9208,12757,19621
4,4,4,4,4,4,4,4,NVIDIA,5,4,4,NVIDIA,absent,$1.045 T,$423.17,1045810,NVDA,NVIDIA CORP,,tech,tech,https://www.sec.gov//Archives/edgar/data/10458...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,-7.0,-309.0,-309.0,-337,-136,,,2709.0,2709,,2709,2004,1397,,,,1.9,1.9,"Furthermore, the tax effects of accounting for...",,"Furthermore, the tax effects of accounting for...",,,4368.0,4368,4368,9752,4332,,62,62.019231,-7.074176,0.620192,https://www.sec.gov//ix?doc=/Archives/edgar/da...,53.0,"['Stock-based compensation expense', '2,709', ...",2709,2004,1397
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,95,95,95,95,95,95,94,Palo Alto Networks,95,94,94,Palo Alto Networks,absent,$78.87 B,$257.88,1327567,PANW,Palo Alto Networks Inc,,tech,tech,https://www.sec.gov//Archives/edgar/data/13275...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,,,,,,,,1013.4,1013.40,,1013.40,926.9,664.5,,low,low,-28.9,-28.9,-28.9,,-28.9,-7.3,-15.2,-267.0,-267,,-267,-499,-267,-380,-379.550562,,-3.795506,,,,,,
96,96,96,96,96,96,96,95,Regeneron Pharmaceuticals,96,95,95,Regeneron Pharmaceuticals,absent,$78.75 B,$717.83,872589,REGN,"REGENERON PHARMACEUTICALS, INC.",,tech,tech,https://www.sec.gov//Archives/edgar/data/87258...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,,,,,,,,725.0,725,,725,601.7,432,,above_ten,above_ten,10.7,10.7,10.7,,10.7,13.4,7.8,4338.4,4338.40,,4338.40,8075.30,3513.20,17,16.711230,,0.167112,https://www.sec.gov//ix?doc=/Archives/edgar/da...,49.0,"['Stock-based compensation expense', '', '725....",725.0,601.7,432.0
97,97,97,97,97,97,97,96,Zoetis,97,96,96,Zoetis,absent,$78.25 B,$169.35,1555280,ZTS,Zoetis Inc.,,non_tech,non_tech,https://www.sec.gov//Archives/edgar/data/15552...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,0.0,-8.0,,-8,-7,-7,,62.0,62,,62,58,59,,above_ten,above_ten,20.5,20.5,20.5,,20.5,18.2,18,2114.0,2114,,2114,2037,1638,3,2.932829,-0.378430,0.029328,https://www.sec.gov//ix?doc=/Archives/edgar/da...,49.0,"['Share-based compensation expense', '', '62',...",62,58,59
98,98,98,98,98,98,98,98,Fiserv,99,98,98,Fiserv,absent,$77.10 B,$124.90,798354,FI,FISERV INC,,tech,tech,https://www.sec.gov//Archives/edgar/data/79835...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,,,,,,,,323.0,323,,323,239,369,,above_ten,above_ten,18.9,18.9,18.9,,18.9,21.8,17,2582.0,2582,,2582,1403,975,13,12.509682,,0.125097,https://www.sec.gov//ix?doc=/Archives/edgar/da...,60.0,"['Share-based compensation', '', '323', '', ''...",323,239,369


# Reading the data in R

In [10]:
%%R 


# df_kenya_education_r_incorrect_female <- read_csv('df_x_y_cleaned_with_teacher_ratio.csv', show_col_types = FALSE)
df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r <- read_csv('df_stock_top_us_plus_sec_titles_merge_df_all_ten_k_document_lists_of_dicts.csv', show_col_types = FALSE)

New names:
• `` -> `...1`



# Guide for regression analysis plan

# So far, I am investigating the following y variables:

1) Effective tax rates: most_recent_tax_rates
2) most_recent_tax_rates_low_or_above_ten

# So far, I am looking at working the following x variables 
* maybe  create a model to predict the y variables mentioned:
Some possible x variables could be:
1) stock_based_compensation_as_a_portion_of_net_income
2) tech_or_non_tech
3) net_income_2022

#### I am planning to do two kinds of analyses in this notebook:
1) Linear regressions for each variable I am interested in
2) Multivariable regressions to attempt to create an ideal model to predict a y variable

# For Linear regressions

In [11]:
# %%R 
# # guide for linear regression
# model <- lm(Y ~ X, data)
# summary(model)

# python_calc_stock_based_compensation_as_a_portion_of_net_income_pct (X1)

#### Stock-based compensation

In [11]:
%%R 
# guide for linear regression
model <- lm(most_recent_tax_rates ~ python_calc_stock_based_compensation_as_a_portion_of_net_income_pct, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)
summary(model)


Call:
lm(formula = most_recent_tax_rates ~ python_calc_stock_based_compensation_as_a_portion_of_net_income_pct, 
    data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)

Residuals:
     Min       1Q   Median       3Q      Max 
-136.258   -3.366    2.765    7.528   32.014 

Coefficients:
                                                                    Estimate
(Intercept)                                                         13.97200
python_calc_stock_based_compensation_as_a_portion_of_net_income_pct  0.03740
                                                                    Std. Error
(Intercept)                                                            2.09616
python_calc_stock_based_compensation_as_a_portion_of_net_income_pct    0.01093
                                                                    t value
(Intercept)                                                           6.666
python_calc_stock_based_compensation_as_a_portion_of_net_income_pct   3.4

### Summary

* This is a simple linear regression where most_recent_tax_rates is the dependent variable (response) and stock_based_compensation_as_a_portion_of_net_income is the independent variable (predictor).

* Intercept: 14.19009

* The coefficients represent the estimated intercept and slope (or effect) of the stock_based_compensation_as_a_portion_of_net_income variable on most_recent_tax_rates: 0.03789

* The estimated regression equation would be: 
>> most_recent_tax_rates = 14.19009 + 0.03789 * stock_based_compensation_as_a_portion_of_net_income

* A low p-value (typically below 0.05) indicates that the coefficient is statistically significant. 

* In this case: Both the intercept and stock_based_compensation_as_a_portion_of_net_income coefficient have p-values less than 0.05, indicating that they are statistically significant predictors.

* Multiple R-squared represents the proportion of the variance explained by the model: just 13%

In summary, this linear regression model suggests that the predictor variable python_calc_stock_based_compensation_as_a_portion_of_net_income_pct is statistically significant in predicting the dependent variable most_recent_tax_rates. The R-squared is positive (0.1262), indicating that the model explains some of the variance in most_recent_tax_rates. The p-value for the predictor variable is below 0.05, providing evidence to reject the null hypothesis that the coefficient is zero. Therefore, based on this analysis, the predictor variable has a significant linear relationship with the dependent variable.


### Reader-facing summary
# Important
* In summary, this linear regression model suggests that there is a statistically significant relationship between python_calc_stock_based_compensation_as_a_portion_of_net_income_pct and most_recent_tax_rates. The model explains approximately 13% of the variance in most_recent_tax_rates, and the effect of stock_based_compensation_as_a_portion_of_net_income on most_recent_tax_rates is positive and statistically significant.

# python_calc_most_recent_tax_benefits_from_stock_based_compensation_over_net_income_pct (X2)

In [12]:
%%R 
# guide for linear regression
model <- lm(most_recent_tax_rates ~ python_calc_most_recent_tax_benefits_from_stock_based_compensation_over_net_income_pct, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)
summary(model)


Call:
lm(formula = most_recent_tax_rates ~ python_calc_most_recent_tax_benefits_from_stock_based_compensation_over_net_income_pct, 
    data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)

Residuals:
     Min       1Q   Median       3Q      Max 
-134.733   -2.090    3.322    8.612   20.256 

Coefficients:
                                                                                       Estimate
(Intercept)                                                                             12.8059
python_calc_most_recent_tax_benefits_from_stock_based_compensation_over_net_income_pct   0.1518
                                                                                       Std. Error
(Intercept)                                                                                3.3011
python_calc_most_recent_tax_benefits_from_stock_based_compensation_over_net_income_pct     0.4508
                                                                                       t val

# Summary
In summary, this linear regression model suggests that the predictor variable python_calc_most_recent_tax_benefits_from_stock_based_compensation_over_net_income_pct is not statistically significant in predicting the dependent variable most_recent_tax_rates. The adjusted R-squared is very low (close to zero), indicating that the model does not explain much of the variance in most_recent_tax_rates. The p-value for the predictor variable is greater than the conventional significance level of 0.05, providing no evidence to reject the null hypothesis that the coefficient is zero. Therefore, based on this analysis, the predictor variable does not have a significant linear relationship with the dependent variable.

This is similar result to t-test though t-test was just for high vs low



# tech_or_non_tech

In [13]:
# %%R 



# df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r_tax_rate_low = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r %>% filter(most_recent_tax_rates_low_or_above_ten=='low')
# df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_tax_rate_above_ten = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r %>% filter(most_recent_tax_rates_low_or_above_ten=='above_ten')
# t.test(df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r_tax_rate_low$just_tech_or_non_tech, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_tax_rate_above_ten$just_tech_or_non_tech)



In [14]:
%%R 
# guide for linear regression
model <- lm(most_recent_tax_rates ~ factor(just_tech_or_non_tech), df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)
summary(model)


Call:
lm(formula = most_recent_tax_rates ~ factor(just_tech_or_non_tech), 
    data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)

Residuals:
     Min       1Q   Median       3Q      Max 
-134.428   -2.812    0.843    5.053   55.772 

Coefficients:
                                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)                         20.687      2.864   7.224 2.11e-10 ***
factor(just_tech_or_non_tech)tech   -8.459      3.915  -2.160   0.0336 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.11 on 84 degrees of freedom
  (14 observations deleted due to missingness)
Multiple R-squared:  0.05264,	Adjusted R-squared:  0.04136 
F-statistic: 4.667 on 1 and 84 DF,  p-value: 0.03359



# Summary
# Important
In summary, this linear regression model suggests that the predictor variable just_tech_or_non_techtech has a  significant relationship with the dependent variable most_recent_tax_rates. The adjusted R-squared is positive (0.04136), indicating that the model explains some of the variance in most_recent_tax_rates. The p-value for the predictor variable is  0.0336, which is below the typical significance level of 0.05, suggesting that the "tech" variable is statistically significant in predicting the response. Therefore, based on this analysis, the predictor variable has a statistically significant association with the dependent variable.

Based on the linear regression model summary provided, the coefficient estimate for the variable factor(just_tech_or_non_tech)tech is -8.459. Since the just_tech_or_non_tech variable is a factor that represents whether a company is categorized as "tech" or "non-tech," this negative coefficient indicates that,  being in the tech sector  is associated with lower tax rates compared to being in the non-tech sector, assuming all other factors are held constant.

It's important to note that correlation does not imply causation, and the coefficient itself does not provide information about the magnitude of the effect or the practical significance. To better understand the relationship between sector type and tax rates, further analysis and consideration of other factors are needed.

# tech_or_non_tech and stock based compensation?

In [15]:
%%R 
# guide for linear regression
model <- lm(most_recent_stock_or_share_based_compensation ~ just_tech_or_non_tech, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)
summary(model)


Call:
lm(formula = most_recent_stock_or_share_based_compensation ~ 
    just_tech_or_non_tech, data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)

Residuals:
    Min      1Q  Median      3Q     Max 
-2863.0 -1763.2  -599.6   -82.6 17443.0 

Coefficients:
                          Estimate Std. Error t value Pr(>|t|)  
(Intercept)                  747.9      611.4   1.223   0.2251  
just_tech_or_non_techtech   1430.1      796.2   1.796   0.0764 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3459 on 76 degrees of freedom
  (22 observations deleted due to missingness)
Multiple R-squared:  0.04072,	Adjusted R-squared:  0.0281 
F-statistic: 3.226 on 1 and 76 DF,  p-value: 0.07644



# Summary

In summary, this linear regression model suggests that the predictor variable just_tech_or_non_techtech has a marginally significant relationship with the dependent variable most_recent_stock_or_share_based_compensation. The adjusted R-squared is positive (0.0281), indicating that the model explains some of the variance in most_recent_stock_or_share_based_compensation, but the proportion of variance explained is relatively low. The p-value for the predictor variable is close to 0.05, providing weak evidence to reject the null hypothesis that the coefficient is zero. Therefore, based on this analysis, the predictor variable might have a slight association with the dependent variable, but the evidence for its significance is not strong.

# net_income_2022 (X2)

#### the size of net income

In [16]:
%%R 
# guide for linear regression
model <- lm(most_recent_tax_rates ~ most_recent_net_income, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)
summary(model)


Call:
lm(formula = most_recent_tax_rates ~ most_recent_net_income, 
    data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)

Residuals:
     Min       1Q   Median       3Q      Max 
-135.843   -4.306    2.420    6.705   53.513 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            1.446e+01  2.440e+00   5.928 6.97e-08 ***
most_recent_net_income 1.162e-04  1.236e-04   0.940     0.35    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.83 on 82 degrees of freedom
  (16 observations deleted due to missingness)
Multiple R-squared:  0.01067,	Adjusted R-squared:  -0.0014 
F-statistic: 0.884 on 1 and 82 DF,  p-value: 0.3499



## Summary
* This is a simple linear regression where most_recent_tax_rates is the dependent variable (response), and most_recent_net_income is the independent variable (predictor).
* The coefficient for most_recent_net_income has a p-value of 0.36, indicating that it is not statistically significant at the conventional significance level (alpha = 0.05).
* In summary, this linear regression model suggests that there is no statistically significant relationship between most_recent_net_income and most_recent_tax_rates. The model explains a very small proportion (approximately 1.12%) of the variance in most_recent_tax_rates, and the effect of most_recent_net_income on most_recent_tax_rates is not statistically significant.

In [None]:
# %%R 
# # guide for linear regression
# model <- lm(Y ~ X, data)
# summary(model)

# How about net income and stock based compensation?

In [17]:
%%R 
# guide for linear regression
model <- lm(most_recent_stock_or_share_based_compensation ~ most_recent_net_income, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)
summary(model)


Call:
lm(formula = most_recent_stock_or_share_based_compensation ~ 
    most_recent_net_income, data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)

Residuals:
    Min      1Q  Median      3Q     Max 
-6167.7  -836.4  -415.7    35.2 16835.3 

Coefficients:
                        Estimate Std. Error t value Pr(>|t|)    
(Intercept)            287.33868  385.75618   0.745    0.459    
most_recent_net_income   0.11712    0.01942   6.030 4.81e-08 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2889 on 80 degrees of freedom
  (18 observations deleted due to missingness)
Multiple R-squared:  0.3125,	Adjusted R-squared:  0.3039 
F-statistic: 36.36 on 1 and 80 DF,  p-value: 4.814e-08



# Summary 
* This is a simple linear regression where most_recent_stock_or_share_based_compensation is the dependent variable (response), and most_recent_net_income is the independent variable (predictor).
* The coefficient for most_recent_net_income has a very low p-value (4.81e-08), indicating that it is highly statistically significant.
* Multiple R-squared represents the proportion of the variance explained by the model: 0.3125

# Important
* In summary, this linear regression model suggests that there is a statistically significant relationship between most_recent_net_income and most_recent_stock_or_share_based_compensation. The model explains approximately 31.25% of the variance in most_recent_stock_or_share_based_compensation, and the effect of most_recent_net_income on most_recent_stock_or_share_based_compensation is statistically significant.

# Going back to what affect tax rates

In [18]:
%%R 
# guide for linear regression
model <- lm(most_recent_tax_rates ~ stock_based_compensation_as_a_portion_of_net_income, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)
summary(model)


Call:
lm(formula = most_recent_tax_rates ~ stock_based_compensation_as_a_portion_of_net_income, 
    data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)

Residuals:
     Min       1Q   Median       3Q      Max 
-136.244   -3.356    2.772    7.519   32.020 

Coefficients:
                                                    Estimate Std. Error t value
(Intercept)                                         13.96941    2.09622   6.664
stock_based_compensation_as_a_portion_of_net_income  0.03741    0.01093   3.424
                                                    Pr(>|t|)    
(Intercept)                                         3.74e-09 ***
stock_based_compensation_as_a_portion_of_net_income 0.000998 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 18.21 on 76 degrees of freedom
  (22 observations deleted due to missingness)
Multiple R-squared:  0.1336,	Adjusted R-squared:  0.1222 
F-statistic: 11.72 on 1 and 76 DF,  p-value: 

In [None]:
# %%R 
# # guide for linear regression
# model <- lm(Y ~ X, data)
# summary(model)

In [27]:
%%R df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r_integer
# guide for linear regression
model <- lm(stock_based_compensation_as_a_portion_of_net_income ~ stock_based_compensation_as_a_portion_of_net_income, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r_integer)
summary(model)


Call:
lm(formula = stock_based_compensation_as_a_portion_of_net_income ~ 
    stock_based_compensation_as_a_portion_of_net_income, data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r_integer)

Residuals:
   Min     1Q Median     3Q    Max 
-187.1 -173.8 -169.6 -130.6 1392.9 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)    183.1      131.3   1.395    0.191

Residual standard error: 454.8 on 11 degrees of freedom
  (2 observations deleted due to missingness)



In [21]:
df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r_integer

Unnamed: 0.1,to_freeze_name,market_cap_rank,...3,Unnamed: 0,name,american_progress_list,market_cap,price,cik_str,ticker,title_searchable,title_confirmed,tech_or_non_tech,ten_k_document_link,actual_full_ten_k_document_link,most_recent_tax_benefits_from_stock_based_compensation,tax_benefits_from_stock_based_compensation_2023,tax_benefits_from_stock_based_compensation_2022,tax_benefits_from_stock_based_compensation_2021,tax_benefits_from_stock_based_compensation_2020,tax_benefits_note,most_recent_stock_or_share_based_compensation,stock_or_share_based_compensation_end_of_2023,stock_or_share_based_compensation_end_of_2022,stock_or_share_based_compensation_end_of_2021,stock_or_share_based_compensation_end_of_2020,stock_notes,most_recent_tax_rates_low_or_above_ten,most_recent_tax_rates,tax_rates_2023,tax_rates_2022,tax_rates_2021,tax_rates_2020,most_recent_net_income,net_income_2023,net_income_2022,net_income_2021,net_income_2020,stock_based_compensation_as_a_portion_of_net_income
1,Tesla,6.0,5.0,5.0,Tesla,absent,$895.32 B,$282.48,1318605.0,TSLA,"Tesla, Inc.",NA_character_,tech,https://www.sec.gov//Archives/edgar/data/13186...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,Our income tax benefits recognized from stock-...,1560,,1560,2121,1734,NA_character_,low,8,,8,11,25,12587,,12587,5644,862,12.0
2,Visa,9.0,8.0,8.0,Visa,absent,$500.46 B,$239.45,1403161.0,V,VISA INC.,NA_character_,tech,https://www.sec.gov//Archives/edgar/data/14031...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,82,-2147483648,82,73,63,NA_character_,602,,602,542,416,NA_character_,above_ten,18,,18,23,21,14957,,14957,12311,10866,4.0
3,Exxon Mobil,12.0,11.0,11.0,Exxon Mobil,present,$432.23 B,$106.91,34088.0,XOM,EXXON MOBIL CORP,NA_character_,non_tech,NA_character_,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,52,-2147483648,52,49,51,NA_character_,648,,648,612,672,The compensation cost charged against income f...,above_ten,33,,33,31,17,55740,,55740,23040,-22440,1.0
4,Salesforce,31.0,30.0,30.0,Salesforce,absent,$208.26 B,$213.82,1108524.0,CRM,"Salesforce, Inc.",NA_character_,tech,https://www.sec.gov//Archives/edgar/data/11085...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,3279,3279.0,2779,2190,NA_character_,NA_character_,above_ten,68,68.0,6,NA_character_,NA_character_,208,208.0,1444,4072,NA_character_,1576.0
5,Netflix,34.0,33.0,33.0,Netflix,absent,$198.22 B,$445.90,1065280.0,NFLX,NETFLIX INC,NA_character_,tech,https://www.sec.gov//Archives/edgar/data/10652...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,575.452,,575.452,403.22,415.18,NA_character_,above_ten,15,,15,12,14,4492,,4492,5116,2761,13.0
6,Comcast,38.0,37.0,37.0,Comcast,absent,$172.92 B,$41.48,1166691.0,CMCSA,COMCAST CORP,NA_character_,tech,NA_character_,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,1336,,1336,1315,1193,NA_character_,above_ten,47,,47,27.5,NA_character_,4925,,4925,13833,10701,27.0
7,Nextera Energy,46.0,45.0,45.0,Nextera Energy,absent,$151.65 B,$74.95,753308.0,NEE,NEXTERA ENERGY INC,NA_character_,tech,https://www.sec.gov//Archives/edgar/data/75330...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,unclear,,unclear,unclear,unclear,NA_character_,above_ten,15,,15,11,NA_character_,3246,,3246,2827,2369,
8,QUALCOMM,55.0,54.0,54.0,QUALCOMM,absent,$130.48 B,$117.13,804328.0,QCOM,QUALCOMM INC/DE,NA_character_,tech,https://www.sec.gov//Archives/edgar/data/80432...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,-489,,-489,-435,-238,NA_character_,above_ten,13,,13,12,NA_character_,12936,,12936,9043,5198,-4.0
9,Intuit,57.0,56.0,56.0,Intuit,absent,$128.22 B,$457.84,896878.0,INTU,INTUIT INC.,NA_character_,tech,https://www.sec.gov//Archives/edgar/data/89687...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,1308,,1308,753,435,NA_character_,above_ten,19,,19,19,NA_character_,2066,,2066,2062,1826,63.0
10,ServiceNow,69.0,68.0,68.0,ServiceNow,absent,$115.26 B,$565.76,1373715.0,NOW,"ServiceNow, Inc.",NA_character_,tech,https://www.sec.gov//Archives/edgar/data/13737...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,1401,,1401,1131,870,NA_character_,above_ten,19,,19,8,NA_character_,325,,325,230,119,431.0


# net_income_2022 (X2)  

#### the size of net income

In [22]:
%%R -o df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r

# Filter the data frame to include only rows with integer values in 'most_recent_tax_rates'
df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r <- df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r %>%
  filter(grepl("^\\d+$", most_recent_net_income))

# Now, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r_integer contains only rows where 'most_recent_tax_rates' are integers.



In [23]:
df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r

Unnamed: 0.7,...1,Unnamed: 0.6,Unnamed: 0.5,Unnamed: 0.4,Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,to_freeze_name,market_cap_rank,Unnamed: 2,Unnamed: 0,name,american_progress_list,market_cap,price,cik_str,ticker,title_searchable,title_confirmed,just_tech_or_non_tech,tech_or_non_tech,ten_k_document_link,actual_full_ten_k_document_link,most_recent_tax_benefits_from_stock_based_compensation_over_net_income,most_recent_tax_benefits_from_stock_based_compensation,tax_benefits_from_stock_based_compensation_2023,tax_benefits_from_stock_based_compensation_2022,tax_benefits_from_stock_based_compensation_2021,tax_benefits_from_stock_based_compensation_2020,tax_benefits_note,most_recent_stock_or_share_based_compensation,most_recent_stock_or_share_based_compensation_expounded,stock_or_share_based_compensation_end_of_2023,stock_or_share_based_compensation_end_of_2022,stock_or_share_based_compensation_end_of_2021,stock_or_share_based_compensation_end_of_2020,stock_notes,most_recent_tax_rates_low_or_above_ten,most_recent_tax_rates_low_or_above_ten_expounded,most_recent_tax_rates,most_recent_tax_rates_before_google,most_recent_tax_rates_expounded,tax_rates_2023,tax_rates_2022,tax_rates_2021,tax_rates_2020,most_recent_net_income,most_recent_net_income_expounded,net_income_2023,net_income_2022,net_income_2021,net_income_2020,stock_based_compensation_as_a_portion_of_net_income,python_calc_stock_based_compensation_as_a_portion_of_net_income_pct,python_calc_most_recent_tax_benefits_from_stock_based_compensation_over_net_income_pct,python_calc_stock_based_compensation_as_a_portion_of_net_income_not_pct,actual_full_ten_k_document_link_collection,table_length,list_of_stock_based_values,first_stock_based_compensation,second_stock_based_compensation,third_stock_based_compensation
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Apple,1.0,0.0,0.0,Apple,absent,$3.030 T,$191.33,320193.0,AAPL,Apple Inc.,apple inc. (aapl),tech,tech,https://www.sec.gov//Archives/edgar/data/32019...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,-4.0,-4002.0,,-4002,-4056,-2476,NA_character_,9280.0,9280,,9280,8108,6975,NA_character_,above_ten,above_ten,16.2,16.2,16.2,,16.2,13.3,14,99803.0,99803,,99803,94680,57411,9.0,9.298318,-4.009900,0.092983,https://www.sec.gov//ix?doc=/Archives/edgar/da...,42.0,"['Share-based compensation expense', '9,038', ...",9038.0,7906.0,6829.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,Microsoft,2.0,1.0,1.0,Microsoft,present,$2.533 T,$338.15,789019.0,MSFT,MICROSOFT CORP,NA_character_,tech,tech,https://www.sec.gov//Archives/edgar/data/78901...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,2.0,1293.0,,1293,1065,938,NA_character_,7502.0,7502,,7502,6118,5289,NA_character_,above_ten,above_ten,13.1,13.1,13.1,,13.1,13.8,17,72738.0,72738,,72738,61271,44281,10.0,10.313729,1.777613,0.103137,NA_character_,,NA_character_,,,
3,2.0,2.0,2.0,2.0,2.0,2.0,2.0,Alphabet (Google),3.0,2.0,2.0,Alphabet (Google),absent,$1.553 T,$122.63,1652044.0,GOOG,Alphabet Inc.,NA_character_,tech,tech,https://www.sec.gov//Archives/edgar/data/16520...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,5.0,2700.0,,2700,3100,3900,NA_character_,19362.0,19362,,19362,15376,12991,NA_character_,above_ten,above_ten,15.9,15.9,15.9,,15.9,16.2,NA_character_,59972.0,59972,,59972,76033,NA_character_,32.0,32.285066,4.502101,0.322851,https://www.sec.gov//ix?doc=/Archives/edgar/da...,50.0,"['Stock-based compensation expense', '12,991',...",12991.0,15376.0,19362.0
4,3.0,3.0,3.0,3.0,3.0,3.0,3.0,Amazon,4.0,3.0,3.0,Amazon,present,$1.337 T,$130.38,1018724.0,AMZN,AMAZON COM INC,NA_character_,tech,tech,https://www.sec.gov//Archives/edgar/data/10187...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,9.0,1900.0,,1900,2700,4300,NA_character_,19621.0,19621,,19621,12757,9208,NA_character_,NA_character_,NA_character_,6.0,6.0,unclear,,unclear,unclear,unclear,21331.0,21331,,21331,33364,-2722,92.0,91.983498,8.907224,0.919835,https://www.sec.gov//ix?doc=/Archives/edgar/da...,38.0,"['Stock-based compensation', '9,208', '', '', ...",9208.0,12757.0,19621.0
5,4.0,4.0,4.0,4.0,4.0,4.0,4.0,NVIDIA,5.0,4.0,4.0,NVIDIA,absent,$1.045 T,$423.17,1045810.0,NVDA,NVIDIA CORP,NA_character_,tech,tech,https://www.sec.gov//Archives/edgar/data/10458...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,-7.0,-309.0,-309.0,-337,-136,NA_character_,NA_character_,2709.0,2709,,2709,2004,1397,NA_character_,NA_character_,NA_character_,1.9,1.9,"Furthermore, the tax effects of accounting for...",,"Furthermore, the tax effects of accounting for...",NA_character_,NA_character_,4368.0,4368,4368.0,9752,4332,NA_character_,62.0,62.019231,-7.074176,0.620192,https://www.sec.gov//ix?doc=/Archives/edgar/da...,53.0,"['Stock-based compensation expense', '2,709', ...",2709.0,2004.0,1397.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,92.0,92.0,92.0,92.0,92.0,92.0,91.0,Cigna,92.0,91.0,91.0,Cigna,absent,$83.06 B,$280.75,1739940.0,CI,Cigna Group,NA_character_,non_tech,non_tech,https://www.sec.gov//Archives/edgar/data/17399...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,,,,NA_character_,NA_character_,NA_character_,NA_character_,264.0,264,,264,268,289,NA_character_,above_ten,above_ten,19.2,19.2,19.2,,19.2,20.2,22,6746.0,6746,,6746,5415,8489,4.0,3.913430,,0.039134,NA_character_,,NA_character_,,,
75,93.0,93.0,93.0,93.0,93.0,93.0,92.0,Airbnb,93.0,92.0,92.0,Airbnb,absent,$82.99 B,$131.69,1559720.0,ABNB,"Airbnb, Inc.",NA_character_,tech,tech,NA_character_,https://www.sec.gov/ix?doc=/Archives/edgar/dat...,2.0,39.9,,39.9,35.6,19,NA_character_,930.0,930,,930,899,3003,NA_character_,low,low,5.0,5.0,5,,5,-17,NA_character_,1893.0,1893,,1893,-352,NA_character_,49.0,49.128368,2.107765,0.491284,NA_character_,,NA_character_,,,
76,94.0,94.0,94.0,94.0,94.0,94.0,93.0,Altria Group,94.0,93.0,93.0,Altria Group,absent,$82.07 B,$45.98,764180.0,MO,"ALTRIA GROUP, INC.",NA_character_,non_tech,non_tech,https://www.sec.gov//Archives/edgar/data/76418...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,,,,NA_character_,NA_character_,NA_character_,NA_character_,,unclear,,unclear,unclear,unclear,NA_character_,above_ten,above_ten,22.0,22.0,22,,22,35.3,35,5764.0,5764,,5764,2475,4454,,,,,NA_character_,,NA_character_,,,
77,97.0,97.0,97.0,97.0,97.0,97.0,96.0,Zoetis,97.0,96.0,96.0,Zoetis,absent,$78.25 B,$169.35,1555280.0,ZTS,Zoetis Inc.,NA_character_,non_tech,non_tech,https://www.sec.gov//Archives/edgar/data/15552...,https://www.sec.gov//ix?doc=/Archives/edgar/da...,0.0,-8.0,,-8,-7,-7,NA_character_,62.0,62,,62,58,59,NA_character_,above_ten,above_ten,20.5,20.5,20.5,,20.5,18.2,18,2114.0,2114,,2114,2037,1638,3.0,2.932829,-0.378430,0.029328,https://www.sec.gov//ix?doc=/Archives/edgar/da...,49.0,"['Share-based compensation expense', '', '62',...",62.0,58.0,59.0


In [25]:
%%R 
# guide for linear regression
model <- lm(most_recent_tax_rates ~ most_recent_net_income, df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)
summary(model)


Call:
lm(formula = most_recent_tax_rates ~ most_recent_net_income, 
    data = df_stock_top_us_plus_sec_titles_with_links_after_copy_paste_r)

Residuals:
    Min      1Q  Median      3Q     Max 
-21.998  -5.730  -0.114   3.383  48.892 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)             1.912e+01  1.425e+00  13.418   <2e-16 ***
most_recent_net_income -3.972e-05  6.799e-05  -0.584    0.561    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9.75 on 71 degrees of freedom
  (5 observations deleted due to missingness)
Multiple R-squared:  0.004783,	Adjusted R-squared:  -0.009234 
F-statistic: 0.3413 on 1 and 71 DF,  p-value: 0.561

