# Into the future dataset

Time Series forecasting techniques are very important for a data scientist to master as time series data occur in every domain from medical, stock market, climate change prediction, and so on.

# # Import Libraries

In [21]:
import numpy as np
import pandas as pd
import os

In [22]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/into-the-future/test.csv
/kaggle/input/into-the-future/train.csv
/kaggle/input/traintest-dataset/test.csv
/kaggle/input/traintest-dataset/train.csv


# # Importing csv files

In [23]:
train = pd.read_csv('../input/into-the-future/train.csv')
test = pd.read_csv('../input/into-the-future/test.csv')

# # Listing the 1st 10 data from the train dataset

In [24]:
train.head(10)

Unnamed: 0,id,time,feature_1,feature_2
0,0,2019-03-19 00:00:00,735.740043,54479.540513
1,1,2019-03-19 00:00:10,734.102947,47888.033714
2,2,2019-03-19 00:00:20,730.060336,47700.882325
3,3,2019-03-19 00:00:30,725.609742,47790.094648
4,4,2019-03-19 00:00:40,724.32848,47808.402381
5,5,2019-03-19 00:00:50,723.715893,47810.517411
6,6,2019-03-19 00:01:00,720.406875,47835.758121
7,7,2019-03-19 00:01:10,712.682269,47806.938481
8,8,2019-03-19 00:01:20,711.509741,47812.099044
9,9,2019-03-19 00:01:30,707.025594,47821.540514


# #  Listing the 1st 10 data from the test dataset

In [25]:
test.head(10)

Unnamed: 0,id,time,feature_1
0,564,2019-03-19 01:34:00,423.064004
1,565,2019-03-19 01:34:10,423.342749
2,566,2019-03-19 01:34:20,423.181186
3,567,2019-03-19 01:34:30,421.275243
4,568,2019-03-19 01:34:40,422.208444
5,569,2019-03-19 01:34:50,422.616254
6,570,2019-03-19 01:35:00,425.829766
7,571,2019-03-19 01:35:10,426.24383
8,572,2019-03-19 01:35:20,426.042621
9,573,2019-03-19 01:35:30,424.733116


# # Knowing the shape of train and test dataset

In [26]:
print("The train data shape",train.shape)

The train data shape (564, 4)


In [27]:
print("The test data shape", test.shape)

The test data shape (375, 3)


# # The information of the train and test dataset

In [28]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 564 entries, 0 to 563
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         564 non-null    int64  
 1   time       564 non-null    object 
 2   feature_1  564 non-null    float64
 3   feature_2  564 non-null    float64
dtypes: float64(2), int64(1), object(1)
memory usage: 17.8+ KB


In [29]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         375 non-null    int64  
 1   time       375 non-null    object 
 2   feature_1  375 non-null    float64
dtypes: float64(1), int64(1), object(1)
memory usage: 8.9+ KB


# # Checking the null values present in both the datasets

In [30]:
train.isnull()

Unnamed: 0,id,time,feature_1,feature_2
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
559,False,False,False,False
560,False,False,False,False
561,False,False,False,False
562,False,False,False,False


In [31]:
test.isnull()

Unnamed: 0,id,time,feature_1
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
370,False,False,False
371,False,False,False
372,False,False,False
373,False,False,False


# # Describe the datasets

In [32]:
train.describe

<bound method NDFrame.describe of       id                 time   feature_1     feature_2
0      0  2019-03-19 00:00:00  735.740043  54479.540513
1      1  2019-03-19 00:00:10  734.102947  47888.033714
2      2  2019-03-19 00:00:20  730.060336  47700.882325
3      3  2019-03-19 00:00:30  725.609742  47790.094648
4      4  2019-03-19 00:00:40  724.328480  47808.402381
..   ...                  ...         ...           ...
559  559  2019-03-19 01:33:10  423.726707  54115.388787
560  560  2019-03-19 01:33:20  424.278568  54127.830664
561  561  2019-03-19 01:33:30  420.127639  54092.561194
562  562  2019-03-19 01:33:40  421.291993  53994.714950
563  563  2019-03-19 01:33:50  423.257581  53863.615475

[564 rows x 4 columns]>

In [33]:
test.describe

<bound method NDFrame.describe of       id                 time   feature_1
0    564  2019-03-19 01:34:00  423.064004
1    565  2019-03-19 01:34:10  423.342749
2    566  2019-03-19 01:34:20  423.181186
3    567  2019-03-19 01:34:30  421.275243
4    568  2019-03-19 01:34:40  422.208444
..   ...                  ...         ...
370  934  2019-03-19 02:35:40  432.292203
371  935  2019-03-19 02:35:50  433.661272
372  936  2019-03-19 02:36:00  435.686429
373  937  2019-03-19 02:36:10  444.585521
374  938  2019-03-19 02:36:20  450.836415

[375 rows x 3 columns]>

# # Data analysis

In [34]:
#Creating the dataframe
#Train the data for feature_1
from sklearn.linear_model import LinearRegression
x1=train['feature_1']
x_train = pd.DataFrame(x1)
x_train.head()

Unnamed: 0,feature_1
0,735.740043
1,734.102947
2,730.060336
3,725.609742
4,724.32848


In [35]:
#Creating the dataframe
#Train the data for feature_2
y1 = train['feature_2']
y_train = pd.DataFrame(y1)
y_train.head()

Unnamed: 0,feature_2
0,54479.540513
1,47888.033714
2,47700.882325
3,47790.094648
4,47808.402381


In [36]:
#Fit the data model
lr= LinearRegression()
lr.fit(x_train,y_train)

LinearRegression()

# # Data splitting

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
#Splitting data to evaluate model
X_train,X_test,y_train,y_test=train_test_split(x_train,y_train,test_size=0.2,random_state=0)

# # Data Prediction

In [39]:
#Saving the prediction values in the variables
y_test1=lr.predict(X_test)
y_train1=lr.predict(X_train)

In [40]:
from sklearn.metrics import r2_score

In [41]:
r2_score(y_test,y_test1)

0.9062886676348167

In [42]:
r2_score(y_train,y_train1)

0.8713194475850381

In [47]:
x = test['feature_1']
x = x.values.reshape(-1,1)
y = lr.predict(x)
print(y)

[[54604.00390246]
 [54598.14453381]
 [54601.54067274]
 [54641.60463632]
 [54621.98823341]
 [54613.4158431 ]
 [54545.86608096]
 [54537.16223888]
 [54541.39175023]
 [54568.91826847]
 [54569.41144295]
 [54777.18494592]
 [54586.57855754]
 [54603.95498835]
 [54609.24638373]
 [54613.41031558]
 [54619.65365698]
 [54629.31267628]
 [54642.17511015]
 [54973.90805646]
 [54673.77103108]
 [54652.79145865]
 [54657.80005997]
 [54679.33271418]
 [54686.43738999]
 [54695.2063284 ]
 [54705.85468608]
 [54713.27282563]
 [54724.5219445 ]
 [54723.3556973 ]
 [54735.63286386]
 [54729.29253   ]
 [54734.87040423]
 [54764.43682949]
 [54752.68029294]
 [54740.22969452]
 [54775.2401482 ]
 [54784.80072356]
 [54802.8847886 ]
 [54795.34384758]
 [54798.17182801]
 [54804.73384771]
 [54805.9710249 ]
 [54788.31233177]
 [54834.25840746]
 [54839.84600794]
 [54830.00705317]
 [54849.78049178]
 [54838.84482589]
 [54847.21874243]
 [54862.32466412]
 [54872.97150994]
 [54863.53365101]
 [54862.78927535]
 [55032.06087455]
 [55005.29

# # Solution: predicted values

Now we are almost done. We have predicted the values. Hence, need to store the values in the train_test_data variable

In [48]:
# store the predicted y values which is the feature_2 predicted values
train_test_data = pd.read_csv('../input/into-the-future/test.csv')
train_test_data['feature_2'] = y

In [50]:
#Storing the predicted result in a .csv file named as "solution.csv"
#For checking 1st 5 values are shown.
train_test_data.to_csv('solution.csv' , index=False, header =1)
train_test_data.head()

Unnamed: 0,id,feature_2
0,564,54604.003902
1,565,54598.144534
2,566,54601.540673
3,567,54641.604636
4,568,54621.988233
