In [None]:
import pandas as pd
from bsedata.bse import BSE
import time
# Pandas display limits used when rendering frames in this notebook
# (line width, then maximum columns and rows shown).
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

# Location of the equity master workbook and the Excel engine used to read it.
github_excel_url = "https://raw.githubusercontent.com/jangid6/Stock-ETL-Project/main/Equity.xlsx"
engine = 'openpyxl'

# Load the workbook and keep 'Security Code' as text rather than a number.
Equity = pd.read_excel(github_excel_url, engine=engine).astype({'Security Code': str})

# Preview the first rows of the loaded equity list.
Equity.head()

Unnamed: 0,Security Code,Issuer Name,Security Id,Security Name,Status,Group,Face Value,ISIN No,Industry,Instrument,Sector Name,Industry New Name,Igroup Name,ISubgroup Name
0,500002,ABB India Limited,ABB,ABB India Limited,Active,A,2.0,INE117A01022,Heavy Electrical Equipment,Equity,Industrials,Capital Goods,Electrical Equipment,Heavy Electrical Equipment
1,500003,Aegis Logistics Ltd.,AEGISLOG,AEGIS LOGISTICS LTD.,Active,A,1.0,INE208C01025,Trading - Gas,Equity,Energy,"Oil, Gas & Consumable Fuels",Gas,Trading - Gas
2,500008,Amara Raja Energy & Mobility Limited,ARE&M,Amara Raja Energy & Mobility Ltd,Active,A,1.0,INE885A01032,Auto Components & Equipments,Equity,Consumer Discretionary,Automobile and Auto Components,Auto Components,Auto Components & Equipments
3,500009,"Ambalal Sarabhai Enterprise Ltd.,",AMBALALSA,AMBALAL SARABHAI ENTERPRISES LTD.,Active,XT,10.0,INE432A01017,Pharmaceuticals,Equity,Healthcare,Healthcare,Pharmaceuticals & Biotechnology,Pharmaceuticals
4,500012,"Andhra Petrochemicals Ltd.,",ANDHRAPET,ANDHRA PETROCHEMICALS LTD.,Active,X,10.0,INE714B01016,Commodity Chemicals,Equity,Commodities,Chemicals,Chemicals & Petrochemicals,Commodity Chemicals


In [None]:

# Symbols of the Nifty 50 stocks; matched against the 'Security Id' column of Equity.
nifty50_stock_symbols = [ "ADANIENT", "ADANIPORTS", "APOLLOHOSP", "ASIANPAINT", "AXISBANK",
    "BAJAJ-AUTO", "BAJFINANCE", "BAJAJFINSV", "BPCL", "BHARTIARTL",
    "BRITANNIA", "CIPLA", "COALINDIA", "DIVISLAB", "DRREDDY", "EICHERMOT",
    "GRASIM", "HCLTECH", "HDFCBANK", "HDFCLIFE", "HEROMOTOCO", "HINDALCO",
    "HINDUNILVR", "ICICIBANK", "ITC", "INDUSINDBK", "INFY", "JSWSTEEL",
    "KOTAKBANK", "LTIM", "LT", "M&M", "MARUTI", "NTPC", "NESTLEIND",
    "ONGC", "POWERGRID", "RELIANCE", "SBILIFE", "SBIN", "SUNPHARMA",
    "TCS", "TATACONSUM", "TATAMOTORS", "TATASTEEL", "TECHM", "TITAN",
    "UPL", "ULTRACEMCO", "WIPRO"
]

# Keep only the rows of the equity list whose 'Security Id' is in the symbol list.
# 'Group' is renamed first, then spaces are removed from every column name
# (e.g. 'Security Code' -> 'SecurityCode'). rename() returns a new frame, so
# nothing is mutated in place.
nifty50_SqDF = (
    Equity[Equity['Security Id'].isin(nifty50_stock_symbols)]
    .reset_index(drop=True)
    .rename(columns={'Group': 'CompanyGroup'})
)
nifty50_SqDF.columns = nifty50_SqDF.columns.str.replace(' ', '')

# Fetch one quote per security code and collect each as a single-row frame.
b = BSE(update_codes=True)
result_dfs = []
sqcode_ListNifty50 = nifty50_SqDF['SecurityCode'].values
for sqCode in sqcode_ListNifty50:
    try:
        stock_data = b.getQuote(sqCode)
        result_dfs.append(pd.DataFrame([stock_data]))
    except IndexError:
        # getQuote raised IndexError for this code; report it and carry on.
        print(f"IndexError for {sqCode}: Data not available")
    # Pause between requests whether or not the lookup succeeded.
    time.sleep(0.5)

# pd.concat() on an empty list fails with an unhelpful "No objects to
# concatenate"; raise the same exception type with an actionable message.
if not result_dfs:
    raise ValueError("No quotes could be fetched for any Nifty 50 security code")

# Stack the quotes and drop the last two columns by position.
# NOTE(review): which fields those two are depends on the key order returned
# by getQuote -- confirm they are the ones meant to be discarded.
# copy() gives later cells an independent frame to modify, avoiding pandas'
# chained-assignment warnings on a slice.
nifty50DailyTable = pd.concat(result_dfs, ignore_index=True).iloc[:, :-2].copy()
nifty50DailyTable.head(n=2)

Unnamed: 0,companyName,currentValue,change,pChange,updatedOn,securityID,scripCode,group,faceValue,industry,previousClose,previousOpen,dayHigh,dayLow,52weekHigh,52weekLow,weightedAvgPrice,totalTradedValue,totalTradedQuantity,2WeekAvgQuantity,marketCapFull,marketCapFreeFloat
0,Bajaj Finance Limited,7126.15,104.3,1.49,28 Nov 23 | 04:00 PM,BAJFINANCE,500034,A / S&P BSE SENSEX,2.0,Financial Services,7021.85,7022.2,7133.0,7022.2,8190.0,5487.25,7081.82,24.93 Cr.,0.35 Lakh,0.52 Lakh,"4,40,412.52 Cr.","1,93,781.51 Cr."
1,CIPLA LTD.,1192.05,-6.8,-0.57,28 Nov 23 | 04:00 PM,CIPLA,500087,A / S&P BSE 100,2.0,Healthcare,1198.85,1198.85,1202.9,1188.35,1283.0,852.0,1195.82,7.53 Cr.,0.63 Lakh,0.81 Lakh,"96,237.23 Cr.","63,516.57 Cr."


In [None]:
def _strip_unit(values, unit):
    """Convert text such as '4,40,412.52 Cr.' to a number.

    Removes thousands separators and the literal ``unit`` text from each
    string in ``values`` (a pandas Series of strings), then converts to
    numeric. Anything that still cannot be parsed becomes NaN.
    """
    # regex=False: the unit is literal text, so the '.' in ' Cr.' must not be
    # treated as a regular-expression wildcard.
    cleaned = values.str.replace(',', '', regex=False).str.replace(unit, '', regex=False)
    return pd.to_numeric(cleaned, errors='coerce')


# Rename 'group' and the columns whose names start with a digit
# (presumably so they are safe to use as SQL column names -- confirm).
# Keys that are already renamed are ignored, so re-running the cell is harmless.
nifty50DailyTable = nifty50DailyTable.rename(columns={
    'group': 'sharegroup',
    '52weekHigh': 'fiftytwoweekHigh',
    '52weekLow': 'fiftytwoweekLow',
    '2WeekAvgQuantity': 'twoWeekAvgQuantity',
})

# Parse 'updatedOn' (e.g. '28 Nov 23 | 04:00 PM'); unparseable values become NaT.
nifty50DailyTable['updatedOn'] = pd.to_datetime(
    nifty50DailyTable['updatedOn'], format='%d %b %y | %I:%M %p', errors='coerce'
)

if nifty50DailyTable['updatedOn'].isna().any():
    # Leave the column as parsed and report the problem.
    print("There are invalid or missing date values in the 'updatedOn' column.")
else:
    # All values parsed: drop the time of day, keeping a datetime column.
    nifty50DailyTable['updatedOn'] = pd.to_datetime(nifty50DailyTable['updatedOn'].dt.date)

# Guard so the conversion only runs once (the source columns are dropped below).
if 'totalTradedValueCr' not in nifty50DailyTable.columns:
    # source column -> (unit text to strip, name of the numeric column)
    unit_columns = {
        'totalTradedValue': (' Cr.', 'totalTradedValueCr'),
        'totalTradedQuantity': (' Lakh', 'totalTradedQuantityLakh'),
        'twoWeekAvgQuantity': (' Lakh', 'twoWeekAvgQuantityLakh'),
        'marketCapFull': (' Cr.', 'marketCapFullCr'),
        'marketCapFreeFloat': (' Cr.', 'marketCapFreeFloatCr'),
    }
    # Append the numeric columns in the order above, then drop the text originals.
    nifty50DailyTable = nifty50DailyTable.assign(**{
        numeric_name: _strip_unit(nifty50DailyTable[text_name], unit_text)
        for text_name, (unit_text, numeric_name) in unit_columns.items()
    }).drop(columns=list(unit_columns))

nifty50DailyTable.head(n=2)

Unnamed: 0,companyName,currentValue,change,pChange,updatedOn,securityID,scripCode,sharegroup,faceValue,industry,previousClose,previousOpen,dayHigh,dayLow,fiftytwoweekHigh,fiftytwoweekLow,weightedAvgPrice,totalTradedValueCr,totalTradedQuantityLakh,twoWeekAvgQuantityLakh,marketCapFullCr,marketCapFreeFloatCr
0,Bajaj Finance Limited,7126.15,104.3,1.49,2023-11-28,BAJFINANCE,500034,A / S&P BSE SENSEX,2.0,Financial Services,7021.85,7022.2,7133.0,7022.2,8190.0,5487.25,7081.82,24.93,0.35,0.52,440412.52,193781.51
1,CIPLA LTD.,1192.05,-6.8,-0.57,2023-11-28,CIPLA,500087,A / S&P BSE 100,2.0,Healthcare,1198.85,1198.85,1202.9,1188.35,1283.0,852.0,1195.82,7.53,0.63,0.81,96237.23,63516.57


In [None]:
# nifty50DailyTable is cleaned in the preceding cell (dates parsed, the
# "Cr."/"Lakh" text columns replaced by numeric ones). Repeating those steps
# here could only be a no-op, because they need that cell's renamed columns
# to exist, so this cell only hands the cleaned frame to Spark.
# NOTE(review): relies on a `spark` session already being defined; the explicit
# SparkSession is only built in a later cell -- confirm the runtime provides one.
nifty50DailyDailyData_spark_df = spark.createDataFrame(nifty50DailyTable)

In [None]:
# Preview the first five rows as a pandas frame (SparkDF.show() is an alternative).
# limit(5) is applied before toPandas() so only the previewed rows are collected
# to the driver instead of the whole DataFrame.
nifty50DailyDailyData_spark_df.limit(5).toPandas()

Unnamed: 0,companyName,currentValue,change,pChange,updatedOn,securityID,scripCode,sharegroup,faceValue,industry,previousClose,previousOpen,dayHigh,dayLow,fiftytwoweekHigh,fiftytwoweekLow,weightedAvgPrice,totalTradedValueCr,totalTradedQuantityLakh,twoWeekAvgQuantityLakh,marketCapFullCr,marketCapFreeFloatCr
0,Bajaj Finance Limited,7126.15,104.3,1.49,2023-11-28,BAJFINANCE,500034,A / S&P BSE SENSEX,2.0,Financial Services,7021.85,7022.2,7133.0,7022.2,8190.0,5487.25,7081.82,24.93,0.35,0.52,440412.52,193781.51
1,CIPLA LTD.,1192.05,-6.8,-0.57,2023-11-28,CIPLA,500087,A / S&P BSE 100,2.0,Healthcare,1198.85,1198.85,1202.9,1188.35,1283.0,852.0,1195.82,7.53,0.63,0.81,96237.23,63516.57
2,STATE BANK OF INDIA,564.55,4.25,0.76,2023-11-28,SBIN,500112,A / S&P BSE SENSEX,1.0,Financial Services,560.3,562.5,565.2,561.1,629.65,499.35,563.05,25.55,4.54,5.75,503838.97,216650.76
3,Titan Company Limited,3442.05,43.45,1.28,2023-11-28,TITAN,500114,A / S&P BSE SENSEX,1.0,Consumer Durables,3398.6,3410.05,3445.6,3397.0,3445.6,2268.9,3431.79,8.44,0.25,0.28,305580.44,143622.8
4,DR.REDDY'S LABORATORIES LTD.,5671.25,28.3,0.5,2023-11-28,DRREDDY,500124,A / S&P BSE 100,5.0,Healthcare,5642.95,5643.0,5713.8,5643.0,5986.2,4176.85,5689.58,2.11,0.04,0.16,94590.24,69050.87


In [None]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session. Change the connection parameters below.
spark = SparkSession.builder.appName("Nifty50DailyData").getOrCreate()
jdbcHostname = "mainsqldbserver.database.windows.net"  # replace with the name of your SQL server
jdbcDatabase = "nifty50db"  # replace with the name of your database
jdbcPort = 1433
jdbcUrl = "jdbc:sqlserver://{0}:{1};database={2}".format(jdbcHostname, jdbcPort, jdbcDatabase)
connectionProperties = {
    # Placeholders: replace with your own values. Prefer loading real
    # credentials from an environment variable or secret store rather than
    # saving them in the notebook.
    "user": "Enter Your UserName",
    "password": "Enter Your Password",
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
}
table_name = "nifty50Table"
nifty50_dailydata_exists = False
try:
    # Probe the target table over JDBC; success means it already exists.
    spark.read.jdbc(url=jdbcUrl, table=table_name, properties=connectionProperties)
    nifty50_dailydata_exists = True
except Exception as read_error:
    # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit still
    # propagate, and show why the probe failed instead of hiding it.
    # NOTE(review): any read failure lands here, not only "table missing", and
    # the write below uses mode="overwrite", which replaces existing table
    # contents -- confirm that is acceptable for e.g. transient connection errors.
    print(f"Could not read {table_name} over JDBC ({read_error}); creating it from the current data")
    columns = [
        "`companyName` VARCHAR(50)",
        "`currentValue` FLOAT",
        "`change` FLOAT",
        "`pChange` FLOAT",
        "`updatedOn` DATE",
        "`securityID` VARCHAR(50)",
        "`scripCode` VARCHAR(50)",
        "`sharegroup` VARCHAR(50)",
        "`faceValue` FLOAT",
        "`industry` VARCHAR(50)",
        "`previousClose` FLOAT",
        "`previousOpen` FLOAT",
        "`dayHigh` FLOAT",
        "`dayLow` FLOAT",
        "`fiftytwoweekHigh` FLOAT",
        "`fiftytwoweekLow` FLOAT",
        "`weightedAvgPrice` FLOAT",
        "`totalTradedQuantityLakh` FLOAT",
        "`totalTradedValueCr` FLOAT",
        "`twoWeekAvgQuantityLakh` FLOAT",
        "`marketCapFullCr` FLOAT",
        "`marketCapFreeFloatCr` FLOAT"
    ]
    # NOTE(review): spark.sql() runs against the Spark session catalog, not over
    # the JDBC connection, so this DDL does not create the SQL Server table --
    # the JDBC write below does. Kept for backward compatibility; confirm it is
    # still needed.
    # IF EXISTS: a plain DROP TABLE raises when the catalog has no such table,
    # which would abort this branch before the JDBC write.
    drop_query = f"DROP TABLE IF EXISTS {table_name}"
    spark.sql(drop_query)
    create_query = f"CREATE TABLE {table_name} ({','.join(columns)})"
    spark.sql(create_query)
    nifty50DailyDailyData_spark_df.createOrReplaceTempView("nifty50dailydata_temp_table")
    # Initial load: write the current data to the target table over JDBC.
    nifty50DailyDailyData_spark_df.write.jdbc(url=jdbcUrl, table=table_name, mode="overwrite",
                                              properties=connectionProperties)
    nifty50_dailydata_exists = True

In [None]:
# Ask the database for the most recent 'updatedOn' already stored in the table.
queryMaxDate = f"SELECT MAX(updatedOn) as max_date FROM {table_name}"
queryresult = spark.read.jdbc(url=jdbcUrl, table="({0}) temp".format(queryMaxDate), properties=connectionProperties)
sql_max_date = queryresult.first()[0]  # None when the table has no rows

# Most recent date present in the freshly fetched quotes.
df_max_updatedOn = nifty50DailyTable['updatedOn'].max()

# Append only when the table is empty or the fetched data is newer.
# pd.Timestamp() normalises the JDBC value (it may come back as a date or a
# datetime) so the comparison with the pandas Timestamp is well defined.
if sql_max_date is None or pd.Timestamp(sql_max_date) < df_max_updatedOn:
    nifty50DailyDailyData_spark_df.write.jdbc(url=jdbcUrl, table=table_name, mode="append",
                                              properties=connectionProperties)
    print("[update Completed] Table is updated with latest data")
else:
    print("[No Update Required] Table is already updated with latest data")

[update Completed] Table is updated with latest data
