In [1]:
import pandas as pd
import yfinance as yf
from pandas.tseries.offsets import BDay

In [2]:
df_item = pd.read_csv("df_item.csv")

In [3]:
# df_item.to_csv("df_item_sample.csv")

In [4]:
df_item

Unnamed: 0,Ticker,Accession Number,Date,Content
0,LANC,0000057515-22-000012,20220817,"Item8.01 Other EventsOn August 17, 2022, Lanca..."
1,LANC,0000057515-22-000024,20221110,Item5.07 Submission of Matters to a Vote of Se...
2,LANC,0000057515-20-000023,20200827,Item2.02 Results of Operations and Financial C...
3,LANC,0000057515-21-000020,20211103,Item2.02 Results of Operations and Financial C...
4,LANC,0000057515-20-000014,20200505,Item2.02 Results of Operations and Financial C...
...,...,...,...,...
6701,VNO,0000899689-21-000032,20210504,Item2.02. Results of Operations and Financial ...
6702,VNO,0000899689-21-000035,20210521,Item5.07. Submission of Matters to a Vote of S...
6703,VNO,0000899689-21-000005,20210129,Item2.02. Results of Operations and Financial ...
6704,VNO,0000899689-20-000036,20200720,Item2.02. Results of Operations and Financial ...


In [5]:
df_item.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6706 entries, 0 to 6705
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Ticker            6706 non-null   object
 1   Accession Number  6706 non-null   object
 2   Date              6706 non-null   int64 
 3   Content           6706 non-null   object
dtypes: int64(1), object(3)
memory usage: 209.7+ KB


### Only keep rows with content starting with X.0Y

In [6]:
df_cleaned = df_item[df_item['Content'].str.match(r'^Item\d+\.\d+.*')]

In [7]:
df_cleaned

Unnamed: 0,Ticker,Accession Number,Date,Content
0,LANC,0000057515-22-000012,20220817,"Item8.01 Other EventsOn August 17, 2022, Lanca..."
1,LANC,0000057515-22-000024,20221110,Item5.07 Submission of Matters to a Vote of Se...
2,LANC,0000057515-20-000023,20200827,Item2.02 Results of Operations and Financial C...
3,LANC,0000057515-21-000020,20211103,Item2.02 Results of Operations and Financial C...
4,LANC,0000057515-20-000014,20200505,Item2.02 Results of Operations and Financial C...
...,...,...,...,...
6701,VNO,0000899689-21-000032,20210504,Item2.02. Results of Operations and Financial ...
6702,VNO,0000899689-21-000035,20210521,Item5.07. Submission of Matters to a Vote of S...
6703,VNO,0000899689-21-000005,20210129,Item2.02. Results of Operations and Financial ...
6704,VNO,0000899689-20-000036,20200720,Item2.02. Results of Operations and Financial ...


### Convert "Date" int to datetime object

In [8]:
df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'].astype(str), format='%Y%m%d')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Date'] = pd.to_datetime(df_cleaned['Date'].astype(str), format='%Y%m%d')


In [9]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6426 entries, 0 to 6705
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Ticker            6426 non-null   object        
 1   Accession Number  6426 non-null   object        
 2   Date              6426 non-null   datetime64[ns]
 3   Content           6426 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 251.0+ KB


### Check Duplicate Assession Number

In [10]:
# Checking for duplicate Accession Numbers
duplicated_rows = df_cleaned[df_cleaned.duplicated(subset=['Ticker', 'Accession Number'], keep=False)]
duplicated_rows

Unnamed: 0,Ticker,Accession Number,Date,Content
745,EHC,0000785161-22-000042,2022-10-13,Item1.01. Entry into a Material Definitive Agr...
746,EHC,0000785161-22-000042,2022-10-13,Item1.01 of Form-8-K. A copy of any omitted Sc...
1063,UNM,0000005513-20-000031,2020-02-24,Item5.03Amendments to Articles of Incorporatio...
1064,UNM,0000005513-20-000031,2020-02-24,Item9.01Financial Statements and Exhibits.(d) ...
1751,RYN,0000052827-21-000109,2021-05-20,Item5.07. Submission of Matters to a Vote of S...
...,...,...,...,...
6595,VAL,0000314808-22-000121,2022-10-31,Item7.01 Regulation FD DisclosureThe Fleet Sta...
6596,VAL,0000314808-22-000126,2022-11-01,Item2.02 Results of Operations and Financial C...
6597,VAL,0000314808-22-000126,2022-11-01,Item2.02 Results of Operations and Financial C...
6600,VAL,0000314808-21-000035,2021-05-03,Item7.01 Regulation FD Disclosure2Item 9.01 Fi...


In [11]:
# Printing the full content for each group of duplicates
duplicate_details = {}

for (ticker, accession_number), group in duplicated_rows.groupby(['Ticker', 'Accession Number']):
    duplicate_details[(ticker, accession_number)] = group['Content'].tolist()

duplicate_details

{('BDC',
  '0000913142-20-000002'): ['Item5.02Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers; Compensatory Arrangements of Certain Officers.', 'Item5.02.Departure of Directors or Certain Officers; Election of Directors; Appointment of Certain Officers; Compensatory Arrangements of Certain Officers.(b)On December 31, 2019, Glenn Pennycook informed Belden Inc. (the “Company”) of his intention to retire as Executive Vice President – Enterprise Solutions effective February 28, 2020. Mr. Pennycook joined the Company in 2008 and served in various leadership positions within the Company’s Enterprise business over that time. The terms of Mr. Pennycook’s retirement will be governed by the Executive Employment Agreement entered into with the Company in May 2013 and filed as Exhibit 10.1 to the Company’s Form 10-Q filed on August 8, 2013.'],
 ('CASY',
  '0000726958-20-000127'): ['Item2.02. Results of Operations and Financial Condition. On Decemb

There are still some noises in the content part, will improve the data preprocessing part.

Now lets just keep the row with more "Item" in the content.

In [12]:
def count_items(content):
    return content.count("Item")

In [13]:
duplicated_rows['item_count'] = duplicated_rows['Content'].apply(count_items)
duplicated_rows

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicated_rows['item_count'] = duplicated_rows['Content'].apply(count_items)


Unnamed: 0,Ticker,Accession Number,Date,Content,item_count
745,EHC,0000785161-22-000042,2022-10-13,Item1.01. Entry into a Material Definitive Agr...,3
746,EHC,0000785161-22-000042,2022-10-13,Item1.01 of Form-8-K. A copy of any omitted Sc...,1
1063,UNM,0000005513-20-000031,2020-02-24,Item5.03Amendments to Articles of Incorporatio...,1
1064,UNM,0000005513-20-000031,2020-02-24,Item9.01Financial Statements and Exhibits.(d) ...,1
1751,RYN,0000052827-21-000109,2021-05-20,Item5.07. Submission of Matters to a Vote of S...,1
...,...,...,...,...,...
6595,VAL,0000314808-22-000121,2022-10-31,Item7.01 Regulation FD DisclosureThe Fleet Sta...,3
6596,VAL,0000314808-22-000126,2022-11-01,Item2.02 Results of Operations and Financial C...,2
6597,VAL,0000314808-22-000126,2022-11-01,Item2.02 Results of Operations and Financial C...,4
6600,VAL,0000314808-21-000035,2021-05-03,Item7.01 Regulation FD Disclosure2Item 9.01 Fi...,2


In [14]:
sorted_duplicated_rows = duplicated_rows.sort_values(by=['Ticker', 'Accession Number', 'item_count'], ascending=[True, True, False])
duplicated_rows

Unnamed: 0,Ticker,Accession Number,Date,Content,item_count
745,EHC,0000785161-22-000042,2022-10-13,Item1.01. Entry into a Material Definitive Agr...,3
746,EHC,0000785161-22-000042,2022-10-13,Item1.01 of Form-8-K. A copy of any omitted Sc...,1
1063,UNM,0000005513-20-000031,2020-02-24,Item5.03Amendments to Articles of Incorporatio...,1
1064,UNM,0000005513-20-000031,2020-02-24,Item9.01Financial Statements and Exhibits.(d) ...,1
1751,RYN,0000052827-21-000109,2021-05-20,Item5.07. Submission of Matters to a Vote of S...,1
...,...,...,...,...,...
6595,VAL,0000314808-22-000121,2022-10-31,Item7.01 Regulation FD DisclosureThe Fleet Sta...,3
6596,VAL,0000314808-22-000126,2022-11-01,Item2.02 Results of Operations and Financial C...,2
6597,VAL,0000314808-22-000126,2022-11-01,Item2.02 Results of Operations and Financial C...,4
6600,VAL,0000314808-21-000035,2021-05-03,Item7.01 Regulation FD Disclosure2Item 9.01 Fi...,2


In [15]:
rows_to_keep = sorted_duplicated_rows.drop_duplicates(subset=['Ticker', 'Accession Number'], keep='first')
rows_to_keep = rows_to_keep.drop(columns=['item_count'])
rows_to_keep

Unnamed: 0,Ticker,Accession Number,Date,Content
5307,BDC,0000913142-20-000002,2020-01-07,Item5.02Departure of Directors or Certain Offi...
5169,CASY,0000726958-20-000127,2020-12-07,Item2.02. Results of Operations and Financial ...
2306,CCK,0001219601-20-000003,2020-02-04,Item2.02. RESULTS OF OPERATIONS AND FINANCIAL ...
2331,CCK,0001219601-20-000014,2020-04-20,Item2.02. RESULTS OF OPERATIONS AND FINANCIAL ...
2335,CCK,0001219601-20-000024,2020-07-20,Item2.02. RESULTS OF OPERATIONS AND FINANCIAL ...
...,...,...,...,...
6597,VAL,0000314808-22-000126,2022-11-01,Item2.02 Results of Operations and Financial C...
1960,WEX,0001309108-20-000007,2020-01-24,Item9.01 of Form 8-K and Rule 3-05 of Regulati...
1964,WEX,0001309108-20-000150,2020-06-29,Item1.01. Entry into a Material Definitive Agr...
2991,ZI,0001794515-21-000034,2021-02-02,Item2.03 Creation of a Direct Financial Obliga...


In [16]:
df_cleaned_final = pd.concat([df_cleaned.drop(duplicated_rows.index), rows_to_keep])
df_cleaned_final

Unnamed: 0,Ticker,Accession Number,Date,Content
0,LANC,0000057515-22-000012,2022-08-17,"Item8.01 Other EventsOn August 17, 2022, Lanca..."
1,LANC,0000057515-22-000024,2022-11-10,Item5.07 Submission of Matters to a Vote of Se...
2,LANC,0000057515-20-000023,2020-08-27,Item2.02 Results of Operations and Financial C...
3,LANC,0000057515-21-000020,2021-11-03,Item2.02 Results of Operations and Financial C...
4,LANC,0000057515-20-000014,2020-05-05,Item2.02 Results of Operations and Financial C...
...,...,...,...,...
6597,VAL,0000314808-22-000126,2022-11-01,Item2.02 Results of Operations and Financial C...
1960,WEX,0001309108-20-000007,2020-01-24,Item9.01 of Form 8-K and Rule 3-05 of Regulati...
1964,WEX,0001309108-20-000150,2020-06-29,Item1.01. Entry into a Material Definitive Agr...
2991,ZI,0001794515-21-000034,2021-02-02,Item2.03 Creation of a Direct Financial Obliga...


In [17]:
# Sorting the duplicated_rows DataFrame based on Ticker and then Date
df_cleaned_final = df_cleaned_final.sort_values(by=['Ticker', 'Date'])

df_cleaned_final

Unnamed: 0,Ticker,Accession Number,Date,Content
1695,ACIW,0000935036-20-000005,2020-02-27,Item2.02. Results of Operation and Financial C...
1690,ACIW,0000935036-20-000011,2020-04-27,Item2.02. Results of Operation and Financial C...
1691,ACIW,0000935036-20-000016,2020-05-07,Item2.02. Results of Operation and Financial C...
1692,ACIW,0000935036-20-000029,2020-08-06,Item2.02. Results of Operation and Financial C...
1699,ACIW,0000935036-20-000040,2020-11-05,Item2.02. Results of Operation and Financial C...
...,...,...,...,...
2980,ZI,0001794515-22-000078,2022-05-02,Item2.02 Results of Operations and Financial C...
3007,ZI,0001794515-22-000095,2022-05-19,Item3.03. Material Modification to Rights of S...
2985,ZI,0001794515-22-000116,2022-06-30,Item5.02 Departure of Directors or Certain Off...
2989,ZI,0001794515-22-000126,2022-08-01,Item2.02 Results of Operations and Financial C...


In [18]:
df_cleaned_final[df_cleaned_final.duplicated(subset=['Ticker', 'Accession Number'], keep=False)]

Unnamed: 0,Ticker,Accession Number,Date,Content


No duplicate rows now!!!

In [19]:
df_cleaned_final

Unnamed: 0,Ticker,Accession Number,Date,Content
1695,ACIW,0000935036-20-000005,2020-02-27,Item2.02. Results of Operation and Financial C...
1690,ACIW,0000935036-20-000011,2020-04-27,Item2.02. Results of Operation and Financial C...
1691,ACIW,0000935036-20-000016,2020-05-07,Item2.02. Results of Operation and Financial C...
1692,ACIW,0000935036-20-000029,2020-08-06,Item2.02. Results of Operation and Financial C...
1699,ACIW,0000935036-20-000040,2020-11-05,Item2.02. Results of Operation and Financial C...
...,...,...,...,...
2980,ZI,0001794515-22-000078,2022-05-02,Item2.02 Results of Operations and Financial C...
3007,ZI,0001794515-22-000095,2022-05-19,Item3.03. Material Modification to Rights of S...
2985,ZI,0001794515-22-000116,2022-06-30,Item5.02 Departure of Directors or Certain Off...
2989,ZI,0001794515-22-000126,2022-08-01,Item2.02 Results of Operations and Financial C...


In [20]:
df_cleaned_final.to_csv("df_item_cleaned.csv")