In [1]:
import sqlite3
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Markdown, display
import calendar
from math import pi
from bokeh.plotting import figure, show, output_notebook, ColumnDataSource


# Render a Markdown-formatted string as rich notebook output.
# This lets headings, lists and tables appear print-ready instead of
# as the plain text a print statement would produce.
def printmd(string):
    """Display ``string`` in the notebook, interpreted as Markdown."""
    rendered = Markdown(string)
    display(rendered)

In [2]:
# Statements that (re)initialise the table holding Elon Musk's tweets.
sql_create = """
CREATE TABLE IF NOT EXISTS tweets (
    id_str TEXT PRIMARY KEY,
    created_at INTEGER,
    full_text TEXT
);
"""

# Clears out rows left behind by a previous run of the notebook.
sql_delete = "DELETE FROM tweets"

conn = sqlite3.connect("twitterTesla.db")
c = conn.cursor()

try:
    # Create the table if needed, then empty it, as a single committed unit.
    for statement in (sql_create, sql_delete):
        c.execute(statement)
    conn.commit()
finally:
    # Discard anything left uncommitted (no-op after a successful commit),
    # then release the cursor and the connection.
    conn.rollback()
    c.close()
    conn.close()

In [3]:
# Statements that (re)initialise the table holding Tesla stock prices.
sql_create = """
CREATE TABLE IF NOT EXISTS tesla (
    id INTEGER PRIMARY KEY,
    date TEXT,
    open DECIMAL(6,2),
    close DECIMAL(6,2),
    high DECIMAL(6,2),
    low DECIMAL(6,2)
);
"""

# Clears out rows left behind by a previous run of the notebook.
sql_delete = "DELETE FROM tesla"

conn = sqlite3.connect("twitterTesla.db")
c = conn.cursor()

try:
    # Create the table if needed, then empty it, as a single committed unit.
    for statement in (sql_create, sql_delete):
        c.execute(statement)
    conn.commit()
finally:
    # Discard anything left uncommitted (no-op after a successful commit),
    # then release the cursor and the connection.
    conn.rollback()
    c.close()
    conn.close()

In [4]:
# A SQLite db file will continue to grow in size after any deletes or drops.
# VACUUM optimizes the database file and reduces its size.
# You can also just remove the db file and recreate it.
# http://www.sqlitetutorial.net/sqlite-vacuum/
conn = sqlite3.connect("twitterTesla.db")
try:
    conn.execute("VACUUM")
finally:
    # Always release the connection; ending the cell on close() also avoids
    # echoing the Cursor object returned by execute() as cell output.
    conn.close()

<sqlite3.Cursor at 0x119a050a0>

In [5]:
# Load the Tesla stock quotes from CSV into the sqlite 'tesla' table.
filepath = 'csv/tslaquotes.csv'

# read csv file, keeping only the date and the open/close/high/low prices
stock_df = pd.read_csv(filepath, parse_dates=['date'], usecols=['date', 'close', 'open', 'high', 'low'])

# drop any time values
stock_df['date'] = stock_df['date'].dt.date

conn = sqlite3.connect("twitterTesla.db")
try:
    # append the data to the 'tesla' table in 'twitterTesla.db'
    stock_df.to_sql("tesla", conn, if_exists="append", index=False)
finally:
    # release the connection even if the insert fails
    conn.close()

In [6]:
# Read the tweets file one line at a time; every non-empty line is parsed as
# its own JSON document and collected into the tweets list.
with open(os.path.join('json/', 'musktweets.json')) as json_file:
    # Blank lines (e.g. a trailing newline at the end of the file) are skipped
    # instead of being handed to json.loads, which would raise on them.
    tweets = [json.loads(line) for line in json_file if line.strip()]

# Keep only the three fields stored in the 'tweets' table.
tweet_df = pd.DataFrame(tweets, columns=['id_str','created_at','full_text'])

# Drop time values and change date format to %Y-%m-%d to match stock quotes
tweet_df['created_at'] = pd.to_datetime(tweet_df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y', utc=True)
tweet_df['created_at'] = tweet_df['created_at'].dt.date

conn = sqlite3.connect("twitterTesla.db")
try:
    # append the data to the 'tweets' table in 'twitterTesla.db'
    tweet_df.to_sql("tweets", conn, if_exists="append", index=False)
finally:
    # release the connection even if the insert fails
    conn.close()

In [7]:
# Inclusive date window of stock quotes to analyse. It is bound into the
# queries as parameters so the two stock queries cannot drift apart and no
# date literal has to be quoted inside the SQL text.
QUOTE_START_DATE = "2016-10-10"
QUOTE_END_DATE = "2018-09-11"
quote_window = (QUOTE_START_DATE, QUOTE_END_DATE)


# SQL STATEMENTS
total_count_sql = """
SELECT count(*) as total_count
FROM tweets
"""
tweet_date_span = """
SELECT MIN(created_at) as min_date, MAX(created_at) as max_date
FROM tweets
"""
stock_date_span = """
SELECT MIN(date) as min_date, MAX(date) as max_date
FROM tesla
"""
month_counts = """
SELECT strftime('%m', created_at) as month, strftime('%Y', created_at) as year, count(*) as month_count, strftime('%Y-%m', created_at) as full_date
FROM tweets
GROUP BY month, year
ORDER BY year, month
"""
stock_quotes_sql = """
SELECT *
FROM tesla a
WHERE a.date >= ? AND a.date <= ?
ORDER BY date
"""
grouped_stock_quotes_sql = """
SELECT AVG(close - open) as average_daily_change, strftime('%m', date) as month, strftime('%Y', date) as year
FROM tesla
WHERE date >= ? AND date <= ?
GROUP BY month, year
ORDER BY year, month
"""


# EXECUTE THE QUERIES
conn = sqlite3.connect("twitterTesla.db")
c = conn.cursor()
try:
    total_count = c.execute(total_count_sql).fetchone()[0]
    twitter_date_span = c.execute(tweet_date_span).fetchone()
    tesla_date_span = c.execute(stock_date_span).fetchone()
    month_count = c.execute(month_counts).fetchall()
    stock_quotes_command = c.execute(stock_quotes_sql, quote_window).fetchall()
    monthly_stock_command = c.execute(grouped_stock_quotes_sql, quote_window).fetchall()
finally:
    # Every result is fully fetched above, so the cursor and connection can be
    # released before any formatting or plotting happens.
    c.close()
    conn.close()


# FORMATTED PRINT STATEMENTS
printmd(f"### Total Tweets: {total_count:,}")
printmd(f"""
### Date Span of Tweets
- Earliest Tweet: {twitter_date_span[0]}
- Latest Tweet: {twitter_date_span[1]}
""")
printmd(f"""
### Date Span of Tesla Quotes
- Earliest Quote: {tesla_date_span[0]}
- Latest Quote: {tesla_date_span[1]}
""")
print_records_by_month = """### Tweets by Month
|   Month   |    Year   |  Tweets   |
|-----------|-----------|-----------|
"""
# Each row is (month, year, month_count, full_date); full_date is only needed
# for the chart below.
for month, year, tweets_in_month, _full_date in month_count:
    print_records_by_month += f"| {month} | {year} |{tweets_in_month:,} |\n"

printmd(print_records_by_month)

# Add a chart of tweets by month

tweetcountdf = pd.DataFrame(month_count, columns=['month','year','month_count','full_date'])

# The categorical x axis takes its factors straight from the 'YYYY-MM' labels,
# which the query already returns in chronological order (one row per month).
month_labels = list(tweetcountdf['full_date'])

p = figure(plot_height=500, x_range=month_labels, title="Tweets by Month",
           toolbar_location=None, tools="")

p.vbar(x='full_date', top='month_count', width=1, source=tweetcountdf,
       line_color='black', fill_color='#1DA1F2')

p.y_range.start = 0
p.xgrid.grid_line_color = None
p.xaxis.axis_label = "Year - Month"
p.yaxis.axis_label = "Number of Tweets"
p.xaxis.major_label_orientation = 1.2
p.outline_line_color = None

output_notebook()
show(p)

### Total Tweets: 3,204


### Date Span of Tweets
- Earliest Tweet: 2016-10-10
- Latest Tweet: 2018-09-11



### Date Span of Tesla Quotes
- Earliest Quote: 2015-10-22
- Latest Quote: 2018-11-13


### Tweets by Month
|   Month   |    Year   |  Tweets   |
|-----------|-----------|-----------|
| 10 | 2016 |62 |
| 11 | 2016 |82 |
| 12 | 2016 |50 |
| 01 | 2017 |107 |
| 02 | 2017 |135 |
| 03 | 2017 |117 |
| 04 | 2017 |46 |
| 05 | 2017 |131 |
| 06 | 2017 |214 |
| 07 | 2017 |133 |
| 08 | 2017 |138 |
| 09 | 2017 |55 |
| 10 | 2017 |92 |
| 11 | 2017 |43 |
| 12 | 2017 |102 |
| 01 | 2018 |57 |
| 02 | 2018 |101 |
| 03 | 2018 |122 |
| 04 | 2018 |93 |
| 05 | 2018 |419 |
| 06 | 2018 |399 |
| 07 | 2018 |309 |
| 08 | 2018 |168 |
| 09 | 2018 |29 |


In [8]:
# The quotes were fetched with SELECT *, so each row follows the column order
# of the 'tesla' table definition: id, date, open, close, high, low.
# The labels below must match that order, otherwise open and close are swapped.
stock_quotes_command_df = pd.DataFrame(stock_quotes_command, columns=['id','date','open','close','high','low'])

# Work on a separate frame whose 'date' column holds real datetimes, as
# required by the datetime x axis.
df = stock_quotes_command_df.assign(date=pd.to_datetime(stock_quotes_command_df["date"]))

# Vertical centre and height of each candle body
mids = (df.open + df.close)/2
spans = abs(df.close-df.open)

inc = df.close > df.open   # days that closed above the open
dec = df.open > df.close   # days that closed below the open
w = 12*60*60*1000 # half day in ms

TOOLS = "pan,wheel_zoom,box_zoom,reset,save"

p = figure(x_axis_type="datetime", tools=TOOLS,
           plot_width=1000, toolbar_location="left", title="Tesla Candlestick")

# Adjust labels to 45 degree angle
p.xaxis.major_label_orientation = pi/4
p.grid.grid_line_alpha=0.3

# Create candlesticks: a high-low wick for every day, then the open-close body
# (light grey for up days, red for down days)
p.segment(df.date, df.high, df.date, df.low, color="black")
p.rect(df.date[inc], mids[inc], w, spans[inc],
       fill_color="#D5E1DD", line_color="black")
p.rect(df.date[dec], mids[dec], w, spans[dec],
       fill_color="#F2583E", line_color="black")

printmd("### Tesla Candlesticks by Day")

printmd("""
##### Use tools to the left of the chart to:
- pan
- zoom in on a selection
- zoom via scrolling
- resetting the chart
(respectively)
""")

output_notebook()
show(p)


### Tesla Candlesticks by Day


##### Use tools to the left of the chart to:
- pan
- zoom in on a selection
- zoom via scrolling
- resetting the chart
(respectively)


In [9]:
average_change_df = pd.DataFrame(monthly_stock_command, columns=['average_daily_change','month','year'])

# Pair each month's tweet count with that same month's average price change.
# Joining on (year, month) instead of relying on row position keeps the pairs
# correct even if a month is missing from one of the two query results.
monthly_df = tweetcountdf.merge(average_change_df, on=['year', 'month'], how='inner')

# render Bokeh output inline in the notebook
output_notebook()

p = figure(plot_width=400, plot_height=400, title="Correlation Between Tweet Count and Avg Daily $ Change", 
           x_axis_label="Tweet Count", y_axis_label="Average Daily Price Change")

# add a circle renderer with a size, color, and alpha
p.circle(monthly_df.month_count, monthly_df.average_daily_change, size=10, color="navy", alpha=.6)

# show the results
show(p)

# np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry [0][1]
# is the coefficient between tweet count and average daily change.
correlation_coefficient = np.corrcoef(monthly_df.month_count, monthly_df.average_daily_change)

printmd(f"""
## Correlation Coefficient of Graph:
#### Relationship: {correlation_coefficient[0][1]}

(0 being no relationship, 1 being perfectly correlated)
""")


## Correlation Coefficient of Graph:
#### Relationship: -0.03161186067231575

(0 being no relationship, 1 being perfectly correlated)


# Conclusion

As you can see, there is virtually no correlation between the number of tweets Elon Musk makes per month and the average daily price change in that month. 10 months have a negative average change in price and 14 months have a positive one, and none of these average changes is greater than about 2.50 USD. Accordingly, the correlation coefficient between the two datasets is almost zero, at about -0.03.



Where I can improve: add a line of best fit to the scatter plot, so that the (lack of) correlation is easier to explain to someone who may not know how the correlation coefficient is calculated.