In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import requests
import json 
import os

In [50]:
"""

Extract data from the CSV files in the "Prices" folder and combine them into a single pandas DataFrame with ticker, date, price, 1-month price change (%), 3-month price change (%), and a Result label derived from the two changes.

"""

# Defines path to files containing price information (just the "Prices" folder)
folder_path = "Prices"

# Forward-looking horizons, expressed as a number of rows in each CSV.
# NOTE(review): presumably one row per trading day, sorted by ascending date
# (20 rows ~ 1 month, 60 rows ~ 3 months) -- confirm against the CSV files.
one_month_rows = 20
three_month_rows = 60

# Thresholds (in %) that both price changes must exceed for a row to be labelled 1 / -1.
# Defined once here instead of being re-assigned on every loop iteration.
one_month_threshold = 5
three_month_threshold = 10

# Initialize an empty list to store the dataframes
dataframes = []

# Iterate over each file in the folder. os.listdir returns entries in arbitrary order,
# so sort them to make the row order of prices_df reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith(".csv"):  # Skip anything that is not a CSV
        continue

    file_path = os.path.join(folder_path, filename)  # Get the path to the file

    # Read the CSV file into a pandas dataframe
    df = pd.read_csv(file_path)

    # Add a new column containing the company ticker, which we extract from the filename
    df['Ticker'] = os.path.splitext(filename)[0]

    # Percentage change between the current Close and the Close N rows later.
    # shift(-N) yields NaN for the last N rows of the file, so those rows get NaN
    # without needing an explicit mask (each file holds a single ticker).
    df['OneMonthChange'] = (df['Close'].shift(-one_month_rows) - df['Close']) / df['Close'] * 100
    df['ThreeMonthChange'] = (df['Close'].shift(-three_month_rows) - df['Close']) / df['Close'] * 100

    # Creating our y column, Result:
    #    1 -> both changes are above their positive threshold
    #   -1 -> both changes are below their negative threshold
    #    0 -> everything else (including rows where a change is NaN)
    df['Result'] = 0
    rose = (df['OneMonthChange'] > one_month_threshold) & (df['ThreeMonthChange'] > three_month_threshold)
    fell = (df['OneMonthChange'] < -one_month_threshold) & (df['ThreeMonthChange'] < -three_month_threshold)
    df.loc[rose, 'Result'] = 1
    df.loc[fell, 'Result'] = -1

    # Append the DataFrame to the list
    dataframes.append(df)

# Fail with a clear message instead of pd.concat's generic "No objects to concatenate"
if not dataframes:
    raise ValueError(f'No CSV files found in folder "{folder_path}"')

# Concatenate all the DataFrames in the list into a single DataFrame
prices_df = pd.concat(dataframes, ignore_index=True)

# Be careful--the below filters out any features not specified so adding a feature above won't show up if you don't modify the below line.
prices_df = prices_df[["Date", "Close", "Ticker", "OneMonthChange", "ThreeMonthChange", "Result"]]

# prices_df has all the CSV files' contents in a single DataFrame
print(prices_df)

                            Date       Close Ticker  OneMonthChange  \
0      1999-11-18 00:00:00-05:00   26.652401      A        4.403417   
1      1999-11-19 00:00:00-05:00   24.456602      A       16.099074   
2      1999-11-22 00:00:00-05:00   26.652401      A        5.965871   
3      1999-11-23 00:00:00-05:00   24.229450      A       18.906221   
4      1999-11-24 00:00:00-05:00   24.873049      A       21.156744   
...                          ...         ...    ...             ...   
92411  2024-04-25 00:00:00-04:00  246.339996    ADP             NaN   
92412  2024-04-26 00:00:00-04:00  243.070007    ADP             NaN   
92413  2024-04-29 00:00:00-04:00  243.949997    ADP             NaN   
92414  2024-04-30 00:00:00-04:00  241.889999    ADP             NaN   
92415  2024-05-01 00:00:00-04:00  247.330002    ADP             NaN   

       ThreeMonthChange  Result  
0             86.079527       0  
1            100.619218       1  
2            120.454482       1  
3          

**TODO:** Merge the price data (`prices_df`) with the earnings-call transcript data here.

In [51]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

# Placeholder texts used to test the pipeline until real transcript data is wired in
texts = ["good test", "bad test"]

# Make the binary label on the combined frame: 1 where Result is 1, 0 where Result is 0 or -1.
# This must target prices_df; `df` is only the last per-file frame left over from the
# loading loop, so writing to it would leave prices_df["Label"] at 1 for every row.
prices_df["Label"] = 1
prices_df.loc[prices_df['Result'] < 1, 'Label'] = 0

# TODO: Split data into training and validation sets

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# TODO: Tokenize texts

# TODO: Convert labels to tensors

# Load pre-trained BERT model for sequence classification.
# num_labels=2 matches the binary Label column above (for now, will assume binary classification).
# The classifier weights are newly initialized (see the warning printed by this cell),
# so the model has to be fine-tuned before it can be used for predictions.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# TODO: Need to define training arguments here

# TODO: Need to define the trainer here

# TODO: Need to fine-tune BERT on our data here
#trainer.train()

# TODO: Need to evaluate our work here
#trainer.evaluate()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
from itertools import combinations
from sklearn.linear_model import LogisticRegression

#LR = LogisticRegression(max_iter=1000)

# Below is straightforward logistic regression code to train the model on the quantitative outputs that BERT produces
#LR.fit(X_train[cols], y_train)
#print(LR.score(X_train[cols], y_train))

In [None]:
import json
import re

# Example messy text: the Q&A portion of an earnings-call transcript, used to exercise the cleaning steps below
messy_text = """
Brian Nowak (Morgan Stanley): Thanks for taking my questions. I have two. First one for Sundar. Appreciate the color on the four key priorities. I wanted to dig a little more into the “build and provide the most helpful products and services.” Maybe if you can sort of talk to us about Search. How do you think about the key investment priorities and innovation areas to continue to make Search more and more helpful for your users and your advertisers? And the second one for Philipp maybe. Similar question on YouTube. You’ve done such a great job on innovation around YouTube. Where do you see the largest incremental opportunities for further innovation at YouTube to deliver more outsized value for your advertisers? Thanks.

Sundar Pichai, CEO Alphabet and Google: Thanks. On Search, great question. I still think we are in very early stages. A recent example which I was proud of was when the ship was stuck in the Suez Canal and then it got out. If you asked a question to Google, I think very soon after that we had the right answer. It seems obvious to do, except we need to provide right answers and without giving wrong answers or misinformation for many other things. So to do that is where all our underlying investments go, and that’s how we think about it over the long-term.

BERT last year, I think, was a great example of it. It was one of our biggest quality improvements. And that was based on the Transformer breakthrough from our Google AI team which laid the foundation for it. So we are continuing to invest that way – in the deep technology. As the web is scaling up, there’s more information than ever before. So that’s a big part of what we are doing. Beyond that, there’s a lot of opportunity to improve the user experience. You’ve seen our efforts around Shopping. That’s one aspect of how we are working hard to improve the experience there so – but we are looking at it pretty deeply. Philipp?

Philipp Schindler, SVP and CBO, Google: Yes. On the YouTube side, let me start with our direct response business. Growth was truly exceptional this quarter. DR was practically nonexistent on YouTube a few years ago, and it’s now a large and fast-growing business. And we’re just getting started, in my view. People already, as you know, go to YouTube to decide what they want to buy. And we want to make it easier for them to buy and make the discovery process overall a lot easier. And for creators, we launched new shoppable capabilities so viewers can actually make purchases from their favorite creators directly on YouTube.

Just as an example, as part of our BrandConnect program, Calvin Klein tested these and drove over – I think it was – 200% lift in brand search and sold out multiple products actually. For merchants, they can now bring their product feeds directly into their video campaigns. And I think we’re still scratching the surface on what’s possible really with commercial intent on YouTube. And then there’s, of course, the opportunity to be a major platform for brands. Historical approaches to reaching audiences through, let’s just say, call it, linear TV don’t really work anymore. Advertisers are using YouTube now to reach the audiences they can’t find anywhere else. And remember, more 18-to-49-year-olds are actually watching YouTube than all linear TV combined. And brands are also seeing more incremental reach on YouTube compared to TV.

So we’re starting to see advertisers buy a mix, actually, of awareness and more action-oriented formats. And they are driving reach and results across the funnel from awareness to consideration to action. So we see a lot of really interesting opportunities here.

Brian Nowak (Morgan Stanley): Great. Thank you, both.

Candice (Operator): Thank you. And our next question comes from Doug Anmuth from JP Morgan. Your line is now open.

Doug Anmuth (JP Morgan): Thanks for taking the questions. I have two. First, Ruth, just wanted to ask you about Cloud. You saw some significant benefits just from the change in useful life. But I think in the past, you talked about 1Q perhaps being the biggest loss of the year. I was just curious if that’s still the case, in your view, going forward. And then second, just given the management transition that we’ve seen at Waymo, should we expect any change in terms of how things are operated there going forward? Thank you.

Ruth Porat, CFO Alphabet and Google: Thanks for the question. So in terms of Cloud and overall performance – I think, the main point I would say is, I wouldn’t extrapolate generally from quarter to quarter, given we’re still in the early stages of building the business. We do intend to continue to invest meaningfully in Cloud, given the opportunity. And so, as you said, there were a couple of things that benefited margins in the quarter, both the depreciation expense item, but also lapping the unusually high allowance for credit losses that were recorded back in the first quarter.

So the main takeaway is we’re continuing to invest. We’ll invest aggressively in products and go-to-market, what we’ve talked about quite consistently over time. And as much as operating losses and operating margin will benefit from increased scale over time, at this point, we do remain focused on investing to build the organization for long-term performance.

Sundar Pichai, CEO Alphabet and Google: And, Doug, on Waymo, John is stepping down as CEO. And he’s been planning for this transition, and Dmitri and Tekedra have been working closely with him. And so we’ll continue our investments there. Pretty excited that the fully autonomous experience of Waymo One is available in Phoenix, and we are also accelerating the development of our next-generation Waymo Driver to deploy it in San Francisco. And this past quarter, Waymo began limited rider testing in San Francisco. And so really focused on making sure we make the hard technical progress so that we can operationalize this. And so we will continue executing towards that.

Doug Anmuth (JP Morgan): Okay. Thank you, both.

Candice (Operator): Thank you. And our next question comes from Brent Thill from Jefferies. Your line is now open.

Brent Thill (Jefferies): Thanks. As it relates to some of the harder-hit industries, I’m curious if you could just characterize the shape of the recovery, what you’re seeing across travel and some of the other sectors. And have there been any verticals that you have yet to see recover that may pull out in the second half of the year? Thank you.

Ruth Porat, CFO Alphabet and Google: So overall, what we indicated is that the strong results reflect, in part, lapping the impact that we saw starting late in Q1 of last year and then a pickup in a number of areas. I think the main thing we’d want to leave you with is that we are seeing, in part, an acceleration in the shift to digital. But it’s too early to forecast the extent to which these changes in consumer behavior and advertising spend will endure. There’s some obvious examples. If you think about, for example, the bump in consumption for things like outfitting your home to work from home, obviously, that doesn’t repeat. And so our main thing is we think it’s premature at this point to really assess how durable those consumer behavior trends are.

Brent Thill (Jefferies): In travel specifically, can you just give us any color in terms of what you’re seeing on that front?

Ruth Porat, CFO Alphabet and Google: Nothing more to add. Philipp had a couple of comments about some of the areas where we’re trying to innovate to be helpful to our partners. But beyond that, nothing to add.

Brent Thill (Jefferies): Great. Thank you.

Candice (Operator): Thank you. And our next question comes from Justin Post from Bank of America. Your line is now open.

Justin Post (Bank of America Merrill Lynch): Maybe one for Philipp and one for Ruth. First, Philipp, you’ve mentioned a couple times the durability of the improvement is tough to gauge. Maybe you could help us understand what the key drivers of Search are that you’re thinking about over the next couple years. Is it queries? Product improvements? Certain changes in verticals like Shopping? How are you thinking about driving Search growth? And then maybe for Ruth, models showed great efficiency last year on the cost side and margins. Anything you’re learning or experiences during the pandemic that we can think about post-pandemic on cost efficiencies or things like that? Thank you.

Philipp Schindler, SVP and CBO, Google: Yes, thank you. Thank you so much for the question. I usually look at the different components of Search as basically four key drivers. The first one, obviously, being the queries. So are we really the best place for users to turn to when they need information? The second one is, I would call it, ads coverage. So what percent of queries is really commercial and then what percentage are we actually covering with ads? And then we need to ask ourselves, do both of these have upside?

The third one is click-through rates. Are individual ad click-through rates close to being optimized? Is there more we can do here by just delivering better creatives, better ads, better answers? To what extent can we deploy next-generation machine learning here? And then the last one is obviously the CPC, right? How much is someone willing to bid for a click on their ad? And this is obviously to a large extent driven by the quality of traffic we’re sending. And then conversion rate is a big driver of this. So working very closely with our partners, advertisers and so on across the world to help them optimize their conversion rates and their ROI. Those are really the four big components, and I’m excited about all four of them actually.

Ruth Porat, CFO Alphabet and Google: And in terms of your question on efficiency, I appreciate the question. I think at the highest level, the approach is unchanged. Our approach on investing and capital allocation is, first and foremost, to support long-term growth with financially sustainable businesses. It’s about being sharper within product areas and then making sure we’re investing in what I keep referring to as operational excellence, things like our technical infrastructure, systems to improve productivity, to improve velocity of our product teams and then the very important efforts around privacy and security and content moderation.

And I think, to your question, the experiences of this past year underscored, really, the value of having made those investments to protect and support operational excellence. It really served us well, and customers, in our ability to deliver throughout this period of time. So that framework is unaltered. I think that part of what you’re seeing in the first quarter – I’ve said it a couple times now, but are some notable items in the quarter. The lapping of the allowance for credit losses, the benefit from depreciation life. And then there were certain things that were due to COVID – just the lower impact for things like T&E and marketing.

And so the main point is we will continue to invest for long-term growth. I’ve said that in both areas, Google Services and Cloud. And we have continued to maintain that framework that you referenced about looking for efficiencies where they are but ensuring that we can deliver for users and customers.

Justin Post (Bank of America Merrill Lynch): Great. Thank you.

Candice (Operator): Thank you. And our next question comes from Colin Sebastian from Baird. Your line is now open.

Colin Sebastian (Baird): Great. Thanks. Good afternoon. Sundar, first, you’ve highlighted for years that machine learning is clearly a strength and differentiation of the overall platform, including in Cloud services, where we’re also seeing competitors focus more on their capabilities here. So I’m wondering if you can talk about the pace of change around data science and how Google can sustain its competitive advantage in those areas. And then, Philipp, I wanted to follow up on the momentum in Search that you attribute to Google Shopping. Is it fair to say that the shift to free product listings has led to the desired increase in retail advertising across the platform, or are there other reasons beyond the pandemic that you attribute for that success? Thanks.

Sundar Pichai, CEO Alphabet and Google: Colin, thanks. And, obviously, as we are thinking about AI, it all starts with foundational R&D we do. I think we are one of the largest R&D investors in AI in the world. And so thinking ahead and doing that, and we’re doing it across all the foundational areas. And we are taking many diverse approaches. So as we make breakthroughs – I earlier spoke about Transformers and how that translated as BERT to improve search quality.

And similarly, we are very committed to taking the AI improvements and bringing it through our GCP offerings to our enterprise customers as well. So it’s an approach we are deeply committed to. And we’re thinking at it with all layers of the stack. So this is why you see us work hard on TPUs and we think about the toolchain for developers on top of all that. And so – and I think when I look at the progress ahead, I think there’s a lot more progress coming down the pipe, and so I’m pretty excited. And it’s why I feel Google – GCP will be differentiated over time as our competitive advantage plays out here.

Philipp Schindler, SVP and CBO, Google: Yes. And on the Shopping side, look, it’s been a year since we brought Bill on board, Bill Ready, and we pivoted our Shopping strategy to better support retailers and consumers, trying to really build an open retail ecosystem. And we’re pleased with the progress we’re making. As you said, free listings and zero commissions have actually lowered barriers for online retail.

Shopping Ads continue to be a powerful way for resellers to promote their products, and the combination of free and paid is a meaningful one. We had a set of new partnerships with Shopify and PayPal that are giving retailers a lot more choice. And we will continue to simplify the, let me call it, end-to-end user and merchant experience, of course. In particular, we are trying to streamline and working hard to streamline the backend experience for merchants, especially for hybrid retailers, so retailers that play in both brick-and-mortar and in digital. And overall, we want to make it much, much easier for retailers to get started on Google and have their information appear across surfaces. And I mentioned the overall strength in retail before. So thank you.

Colin Sebastian (Baird): Thanks, guys.

Candice (Operator): Thank you. And our next question comes from Mark Mahaney from ISI. Your line is now open.

Mark Mahaney (Evercore ISI): Thanks. I wanted to ask about your attempts to retain advertisers. And I ask it this way. I think we’ve had record numbers of new business formations in the country and around the world on the unfortunate impact of COVID. But I think that’s – my guess is it’s been a huge tailwind for your business. At the same time, we’ve had this real tipover, I think, of linear TV ad budgets in the back half of the year to online channels like YouTube. So talk about these new advertisers that you’ve brought onto the Google platform, what you’ve been able to do, how confident you are in your ability to retain them, your advertiser retention strategy. Thanks a lot.

Philipp Schindler, SVP and CBO, Google: So I can take this. I mean, a lot of the new advertisers that you’re referring to are obviously SMBs. And there’s no doubt that this has been a challenging year for SMBs. The pandemic has disrupted how many of them connect with their customers. But, frankly, the pandemic has also been a catalyst for key consumer trends, obviously creating a lot of new opportunities for small businesses. And obviously consumers are spending more time online. They’re buying more online. They were willing to try new brands. And they’re eager to support local businesses, SMBs.

So searches for “support local businesses” are up significantly since last year. And we’ve been focused, really, on helping SMBs with simpler tools so they can actually embrace digital a lot faster. And that’s where we have really invested over the year – making everything simpler. We had a very wide range of solutions to help them get online, get discovered across all of our key products – Search, Maps, YouTube and so on. And there’s multiple, multiple fascinating stories from them coming back to us. And we see this positively reflected in our rates here as well.

Mark Mahaney (Evercore ISI): Okay. Thank you, Philipp.

Candice (Operator): Thank you. And our next question comes from Michael Nathanson from MoffettNathanson. Your line is now open.

Michael Nathanson (MoffettNathanson): Thank you. I have two, one for Philipp, one for Ruth. Philipp, on the questions on Search, when we step back, which categories, which geographies do you think you’re still underrepresented as a percentage of marketing spending – where we could see potentially even more lift to come? And then to Ruth, we always asked in the past about capex spending – we know the change in useful life. But I wonder, has this pandemic changed maybe your approach to the office space that you bought and thinking about how the company’s going to deploy capital in terms of space going forward and how you think about the future of capex in the next couple years based on post-pandemic?

Ruth Porat, CFO Alphabet and Google: Philipp seems to be on mute. I’ll go ahead and start on the capex question.

Michael Nathanson (MoffettNathanson): Okay. Okay. Thanks, Ruth.

Ruth Porat, CFO Alphabet and Google: So in terms of capex, I think – I’ll address two parts. You asked about office facilities, but I do think it’s important to note, we are continuing to invest in our technical infrastructure, and that’s what you saw again here this quarter, and we’ll continue to do so to support growth that we’re seeing in Cloud and Search and Ads and Machine Learning. No change there so you’ll be seeing that, but the core of your question was really about office facilities.

And I think it’s – we’ve been very clear we do value bringing people together in the office. And we’re looking at a hybrid work from home, work from office model. As we look forward at developing our real estate footprint for offices, what we factor into it is, first, we are growing our headcount. We are looking at less density per employee. So even with a hybrid work environment, we will continue to need space. And so we’re continuing to build out our campuses and office facilities.

What you saw in the first quarter was a slightly slower pace of that and a slower pace on fit outs as well as we’re evolving what does this space look like. But we expect to continue to pick up the pace there as we fit out our spaces for this kind of new, reimagined environment. So, yes, we’ll continue to be investing in campuses around the globe as we have been.

Philipp Schindler, SVP and CBO, Google: Yes. And on your first part of your question, look, we’re looking at our business from a very global perspective and are excited about it. Keep in mind, we’re not just addressing above the line marketing budgets from an addressable market perspective. So not just traditional advertising, TV advertising and so on. Below the line budgets are really significant, everything – promotional pricing, product placement, sponsorships and so on and so on.

So there’s this massive acceleration in e-commerce due to the pandemic. Still, more than 80% of commerce is still offline, so there’s a huge opportunity here across the world for us to tap into – into those other budgets. They were really traditionally used in a very different context. So there’s plenty of room for growth here. And I talked about how we look at it from a queries perspective, from a commercial intent perspective. We’re trying to use machine learning really smartly here. But the real focus in the end has to be how do we actually make our partners successful, how do we drive incremental ROI for them? And as long as we continue this, well, I think we should continue to see budgets move our way as well.

Michael Nathanson (MoffettNathanson): Thank you.

Candice (Operator): Thank you. And our final question comes from the line of Brian Fitzgerald from Wells Fargo. Your line is now open.

Brian Fitzgerald (Wells Fargo): Hey, guys, you mentioned the strength in the supply-side products in the Network business. Wondering if you might be able to comment on how the demand-side products are doing? And maybe in a similar vein, some of the changes you’ve made in ad technology over the last few years may have had the effect of drawing some of your advertiser customers more deeply into your tech stack. Wondering if this is also creating a strong onramp in GCP specifically around data analytics products like BigQuery? Thanks.

Ruth Porat, CFO Alphabet and Google: So in terms of overall on Network revenues, as I think I noted briefly in opening comments, what we’re really seeing is the ongoing strength in advertiser spend. Both Philipp and I talked about that. Particularly what we saw was AdMob and Ad Manager and particular strength in App Campaigns. And all of this just underscores what each of us commented on, that the results do reflect what was broad-based strength across our partners’ properties in the first quarter.

Brian Fitzgerald (Wells Fargo): Thanks, Ruth.

Ruth Porat, CFO Alphabet and Google: Thank you."""

# Clean the whole text by removing unwanted characters (kept as a single string)
clean_text = re.sub(r'[^\w\s]', '', messy_text)  # Remove punctuation
clean_text = re.sub(r'\n', ' ', clean_text)      # Remove newline characters
clean_text = re.sub(r'\s+', ' ', clean_text)     # Remove extra whitespace

# Split into sentences BEFORE stripping punctuation. Splitting clean_text on '.' can
# never work, because every '.' has already been removed from it, which would leave
# the entire transcript as one single "sentence".
flat_text = re.sub(r'\s+', ' ', messy_text).strip()

# A sentence ends at '.', '!' or '?', optionally followed by closing quotes, then whitespace.
raw_sentences = re.split(r'(?<=[.!?])[”"’\']*\s+', flat_text)

# Apply the same cleaning as above to each sentence and drop any that end up empty
sentences = []
for raw_sentence in raw_sentences:
    cleaned_sentence = re.sub(r'[^\w\s]', '', raw_sentence)             # Remove punctuation
    cleaned_sentence = re.sub(r'\s+', ' ', cleaned_sentence).strip()    # Remove extra whitespace
    if cleaned_sentence:
        sentences.append(cleaned_sentence)

# Convert the sentences into a JSON structure
json_data = {"sentences": sentences}

# Write the JSON data to a file; explicit encoding so the result does not depend on the platform default
with open("cleaned_data.json", "w", encoding="utf-8") as json_file:
    json.dump(json_data, json_file, indent=4)

print("JSON file written successfully!")
